diff --git a/.gitattributes b/.gitattributes index d47ac04a3b4c8922810df77cab6f0394dbe9bdb7..15caeca0559652cfd179862d561abade06f2740b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -36,3 +36,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text asset/banner.png filter=lfs diff=lfs merge=lfs -text docs/resources/web-ui.jpg filter=lfs diff=lfs merge=lfs -text docs/resources/dpo_data.png filter=lfs diff=lfs merge=lfs -text +docs/transformers/tests/fixtures/tests_samples/COCO/000000039769.png filter=lfs diff=lfs merge=lfs -text +docs/transformers/tests/fixtures/tests_samples/COCO/000000004016.png filter=lfs diff=lfs merge=lfs -text diff --git a/docs/transformers/tests/fixtures/spiece.model b/docs/transformers/tests/fixtures/spiece.model new file mode 100644 index 0000000000000000000000000000000000000000..64f3146c1fe2a7b57b19725c0627bb6b66ea0da0 --- /dev/null +++ b/docs/transformers/tests/fixtures/spiece.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fefb02b667a6c5c2fe27602d28e5fb3428f66ab89c7d6f388e7c8d44a02d0336 +size 760289 diff --git a/docs/transformers/tests/fixtures/test_sentencepiece.model b/docs/transformers/tests/fixtures/test_sentencepiece.model new file mode 100644 index 0000000000000000000000000000000000000000..c93fabdc0d8840e28baff407ec1a048eff8abc23 --- /dev/null +++ b/docs/transformers/tests/fixtures/test_sentencepiece.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dfd1eae4522281b1b839eab877a791befec7a1663a41c814c77d9c89c748f2d +size 253154 diff --git a/docs/transformers/tests/fixtures/test_sentencepiece_bpe.model b/docs/transformers/tests/fixtures/test_sentencepiece_bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..4f7197401c9d0863e92f645dfe04b4447c4b431f --- /dev/null +++ b/docs/transformers/tests/fixtures/test_sentencepiece_bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4de78f5d11ee09141165d31da7dad97e809dd6ee7b52a0cbc6d76a973028286 +size 251527 diff --git a/docs/transformers/tests/fixtures/test_sentencepiece_bpe_char.model b/docs/transformers/tests/fixtures/test_sentencepiece_bpe_char.model new file mode 100644 index 0000000000000000000000000000000000000000..8fb73691942626fa75df80b61aab0e9b9340d8e2 --- /dev/null +++ b/docs/transformers/tests/fixtures/test_sentencepiece_bpe_char.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560 +size 238473 diff --git a/docs/transformers/tests/fixtures/test_sentencepiece_no_bos.model b/docs/transformers/tests/fixtures/test_sentencepiece_no_bos.model new file mode 100644 index 0000000000000000000000000000000000000000..3821900071a060c06044cc43c30519faedcd1b38 --- /dev/null +++ b/docs/transformers/tests/fixtures/test_sentencepiece_no_bos.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f3af97c2e7bc51d781e7440aa33deee7f482eac819d23fd24af80e7b4ce2646 +size 253134 diff --git a/docs/transformers/tests/fixtures/test_sentencepiece_with_bytefallback.model b/docs/transformers/tests/fixtures/test_sentencepiece_with_bytefallback.model new file mode 100644 index 0000000000000000000000000000000000000000..fbf70ee25f893394e33498e3c8969288cc29d5cd --- /dev/null +++ b/docs/transformers/tests/fixtures/test_sentencepiece_with_bytefallback.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c61ecce43369fc3bab9566464f0e71f3ad75dc2319a5aadc2a561e3e312502e3 +size 270096 diff --git 
a/docs/transformers/tests/fixtures/tests_samples/COCO/000000004016.png b/docs/transformers/tests/fixtures/tests_samples/COCO/000000004016.png new file mode 100644 index 0000000000000000000000000000000000000000..698ce0e1b88f5d7675c33185d7c1a698e0ec3a18 --- /dev/null +++ b/docs/transformers/tests/fixtures/tests_samples/COCO/000000004016.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88bb5c2a7f00292db072f7d95638c6f5a46c7d824fff6412327e636eedc1fc24 +size 636411 diff --git a/docs/transformers/tests/fixtures/tests_samples/COCO/000000039769.png b/docs/transformers/tests/fixtures/tests_samples/COCO/000000039769.png new file mode 100644 index 0000000000000000000000000000000000000000..92f01f8b50fc6d714cd42d1441a637f7aa864127 --- /dev/null +++ b/docs/transformers/tests/fixtures/tests_samples/COCO/000000039769.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf6f3c4befa148732c7453e0de5afab00f682427435fead2d88b07a9615cdac2 +size 694498 diff --git a/docs/transformers/tests/models/byt5/__init__.py b/docs/transformers/tests/models/byt5/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/byt5/test_tokenization_byt5.py b/docs/transformers/tests/models/byt5/test_tokenization_byt5.py new file mode 100644 index 0000000000000000000000000000000000000000..519d83694ac1c9bb72acdf87363fbf24e4aa8988 --- /dev/null +++ b/docs/transformers/tests/models/byt5/test_tokenization_byt5.py @@ -0,0 +1,366 @@ +# Copyright 2020 Google T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import re +import shutil +import tempfile +import unittest +from functools import lru_cache + +from transformers import AddedToken, BatchEncoding, ByT5Tokenizer +from transformers.utils import cached_property, is_tf_available, is_torch_available + +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible + + +if is_torch_available(): + FRAMEWORK = "pt" +elif is_tf_available(): + FRAMEWORK = "tf" +else: + FRAMEWORK = "jax" + + +class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = ByT5Tokenizer + test_rust_tokenizer = False + + @classmethod + def setUpClass(cls): + super().setUpClass() + tokenizer = ByT5Tokenizer() + tokenizer.save_pretrained(cls.tmpdirname) + + @cached_property + def t5_base_tokenizer(self): + return ByT5Tokenizer.from_pretrained("google/byt5-small") + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs) -> ByT5Tokenizer: + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> tuple[str, list]: + # XXX The default common tokenizer tests assume that every ID is decodable on its own. 
+ # This assumption is invalid for ByT5 because single bytes might not be + # valid utf-8 (byte 128 for instance). + # Here we're overriding the smallest possible method to provide + # a clean sequence without making the same assumption. + + toks = [] + for i in range(len(tokenizer)): + try: + tok = tokenizer.decode([i], clean_up_tokenization_spaces=False) + except UnicodeDecodeError: + pass + toks.append((i, tok)) + + toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks)) + toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks)) + if max_length is not None and len(toks) > max_length: + toks = toks[:max_length] + if min_length is not None and len(toks) < min_length and len(toks) > 0: + while len(toks) < min_length: + toks = toks + toks + # toks_str = [t[1] for t in toks] + toks_ids = [t[0] for t in toks] + + # Ensure consistency + output_txt = tokenizer.decode(toks_ids, clean_up_tokenization_spaces=False) + if " " not in output_txt and len(toks_ids) > 1: + output_txt = ( + tokenizer.decode([toks_ids[0]], clean_up_tokenization_spaces=False) + + " " + + tokenizer.decode(toks_ids[1:], clean_up_tokenization_spaces=False) + ) + if with_prefix_space: + output_txt = " " + output_txt + output_ids = tokenizer.encode(output_txt, add_special_tokens=False) + return output_txt, output_ids + + def test_eos_treatment(self): + tokenizer = self.t5_base_tokenizer + batch_with_eos_added = tokenizer(["hi</s>", "I went to the gym</s>", "</s>"]) + batch_without_eos_added = tokenizer(["hi", "I went to the gym", ""]) + self.assertListEqual(batch_with_eos_added["input_ids"], batch_without_eos_added["input_ids"]) + + def test_multibytes_char(self): + tokenizer = self.t5_base_tokenizer + src_text = "Unicode €." + encoded = tokenizer(src_text) + encoded_ids = [88, 113, 108, 102, 114, 103, 104, 35, 229, 133, 175, 49, 1] + self.assertEqual(encoded["input_ids"], encoded_ids) + + # decoding + decoded = tokenizer.decode(encoded_ids) + self.assertEqual(decoded, "Unicode €.") + + encoded = tokenizer("e è é ê ë") + encoded_ids = [104, 35, 198, 171, 35, 198, 172, 35, 198, 173, 35, 198, 174, 1] + self.assertEqual(encoded["input_ids"], encoded_ids) + # decoding + decoded = tokenizer.decode(encoded_ids) + self.assertEqual(decoded, "e è é ê ë") + + # encode/decode, but with `encode` instead of `__call__` + self.assertEqual(tokenizer.decode(tokenizer.encode("e è é ê ë")), "e è é ê ë") + + def test_prepare_batch_integration(self): + tokenizer = self.t5_base_tokenizer + src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."] + expected_src_tokens = [68, 35, 111, 114, 113, 106, 35, 115, 100, 117, 100, 106, 117, 100, 115, 107, 35, 105, 114, 117, 35, 118, 120, 112, 112, 100, 117, 108, 125, 100, 119, 108, 114, 113, 49, 1, 0] # fmt: skip + batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK) + self.assertIsInstance(batch, BatchEncoding) + + if FRAMEWORK != "jax": + result = list(batch.input_ids.numpy()[0]) + else: + result = list(batch.input_ids.tolist()[0]) + + self.assertListEqual(expected_src_tokens, result) + + self.assertEqual((2, 37), batch.input_ids.shape) + self.assertEqual((2, 37), batch.attention_mask.shape) + + def test_empty_target_text(self): + tokenizer = self.t5_base_tokenizer + src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."] + batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK) + # check if input_ids are returned and no decoder_input_ids + self.assertIn("input_ids", batch)
+ self.assertIn("attention_mask", batch) + self.assertNotIn("decoder_input_ids", batch) + self.assertNotIn("decoder_attention_mask", batch) + + def test_max_length_integration(self): + tokenizer = self.t5_base_tokenizer + tgt_text = [ + "Summary of the text.", + "Another summary.", + ] + targets = tokenizer( + text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK + ) + self.assertEqual(32, targets["input_ids"].shape[1]) + + def test_eos_in_input(self): + tokenizer = self.t5_base_tokenizer + src_text = ["A long paragraph for summarization. "] + tgt_text = ["Summary of the text. "] + expected_src_tokens = [68, 35, 111, 114, 113, 106, 35, 115, 100, 117, 100, 106, 117, 100, 115, 107, 35, 105, 114, 117, 35, 118, 120, 112, 112, 100, 117, 108, 125, 100, 119, 108, 114, 113, 49, 35, 1] # fmt: skip + expected_tgt_tokens = [86, 120, 112, 112, 100, 117, 124, 35, 114, 105, 35, 119, 107, 104, 35, 119, 104, 123, 119, 49, 35, 1] # fmt: skip + + batch = tokenizer(src_text, text_target=tgt_text) + + self.assertEqual(expected_src_tokens, batch["input_ids"][0]) + self.assertEqual(expected_tgt_tokens, batch["labels"][0]) + + # cannot use default save_and_load_tokenizer test method because tokenizer has no vocab + def test_save_and_load_tokenizer(self): + # safety check on max_len default value so we are sure the test works + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + self.assertNotEqual(tokenizer.model_max_length, 42) + + # Now let's start the test + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + tmpdirname = tempfile.mkdtemp() + + sample_text = " He is very happy, UNwant\u00e9d,running" + before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) + self.assertListEqual(before_tokens, after_tokens) + + shutil.rmtree(tmpdirname) + + tokenizers = self.get_tokenizers(model_max_length=42) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + tmpdirname = tempfile.mkdtemp() + + sample_text = " He is very happy, UNwant\u00e9d,running" + tokenizer.add_tokens(["bim", "bambam"]) + additional_special_tokens = tokenizer.additional_special_tokens + additional_special_tokens.append("new_additional_special_token") + tokenizer.add_special_tokens( + {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False + ) + before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) + self.assertListEqual(before_tokens, after_tokens) + self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens) + self.assertEqual(after_tokenizer.model_max_length, 42) + + tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43) + self.assertEqual(tokenizer.model_max_length, 43) + + shutil.rmtree(tmpdirname) + + # There is a conflict between the default value of extra_ids and adding a 
new special token through additional_special_tokens + # We need to add the extra_ids in the list of the arg additional_special_tokens + def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self): + tokenizer_list = [] + if self.test_slow_tokenizer: + tokenizer_list.append((self.tokenizer_class, self.get_tokenizer())) + + if self.test_rust_tokenizer: + tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer())) + + for tokenizer_class, tokenizer_utils in tokenizer_list: + with tempfile.TemporaryDirectory() as tmp_dir: + tokenizer_utils.save_pretrained(tmp_dir) + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file: + special_tokens_map = json.load(json_file) + + with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file: + tokenizer_config = json.load(json_file) + + added_tokens_extra_ids = [f"<extra_id_{i}>" for i in range(125)] + + special_tokens_map["additional_special_tokens"] = added_tokens_extra_ids + [ + "an_additional_special_token" + ] + tokenizer_config["additional_special_tokens"] = added_tokens_extra_ids + [ + "an_additional_special_token" + ] + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile: + json.dump(special_tokens_map, outfile) + with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile: + json.dump(tokenizer_config, outfile) + + # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes + # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and + # "special_tokens_map.json" files + tokenizer_without_change_in_init = tokenizer_class.from_pretrained( + tmp_dir, + ) + self.assertIn( + "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens + ) + # self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab + self.assertEqual( + ["an_additional_special_token"], + tokenizer_without_change_in_init.convert_ids_to_tokens( + tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"]) + ), + ) + + # Now we test that we can change the value of additional_special_tokens in the from_pretrained + new_added_tokens = added_tokens_extra_ids + [AddedToken("a_new_additional_special_token", lstrip=True)] + tokenizer = tokenizer_class.from_pretrained( + tmp_dir, + additional_special_tokens=new_added_tokens, + ) + + self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens) + self.assertEqual( + ["a_new_additional_special_token"], + tokenizer.convert_ids_to_tokens( + tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"]) + ), + ) + + def test_decode_single_bytes(self): + tokenizer_list = [] + if self.test_slow_tokenizer: + tokenizer_list.append((self.tokenizer_class, self.get_tokenizer())) + + if self.test_rust_tokenizer: + tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer())) + + for tokenizer_class, tokenizer_utils in tokenizer_list: + with tempfile.TemporaryDirectory() as tmp_dir: + tokenizer_utils.save_pretrained(tmp_dir) + + tokenizer = tokenizer_class.from_pretrained(tmp_dir) + + self.assertTrue(tokenizer.decode([255]) == "") + + @unittest.skip(reason="ByT5Tokenizer does not have a vocabulary") + def test_get_vocab(self): + pass + + @unittest.skip(reason="inputs cannot be pretokenized as ids depend on whole input string") + def
test_pretokenized_inputs(self): + pass + + @unittest.skip(reason="ByT5Tokenizer does not have a vocabulary") + def test_conversion_reversible(self): + pass + + def test_convert_tokens_to_string_format(self): + # The default common tokenizer tests use invalid tokens for ByT5 that can only accept one-character strings + # and special added tokens as tokens + tokenizers = self.get_tokenizers(fast=True, do_lower_case=True) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + tokens = ["t", "h", "i", "s", " ", "i", "s", " ", "a", " ", "t", "e", "x", "t", "</s>"] + string = tokenizer.convert_tokens_to_string(tokens) + + self.assertIsInstance(string, str) + + # We need a different implementation of the test of the same name defined in TokenizerTesterMixin because this tokenizer + # doesn't have a vocab + def test_tokenizers_common_ids_setters(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + attributes_list = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + ] + + token_id_to_test_setters = 0 + token_to_test_setters = tokenizer.convert_ids_to_tokens( + token_id_to_test_setters, skip_special_tokens=False + ) + + for attr in attributes_list: + setattr(tokenizer, attr + "_id", None) + self.assertEqual(getattr(tokenizer, attr), None) + self.assertEqual(getattr(tokenizer, attr + "_id"), None) + + setattr(tokenizer, attr + "_id", token_id_to_test_setters) + self.assertEqual(getattr(tokenizer, attr), token_to_test_setters) + self.assertEqual(getattr(tokenizer, attr + "_id"), token_id_to_test_setters) + + setattr(tokenizer, "additional_special_tokens_ids", []) + self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), []) + self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), []) + + setattr(tokenizer, "additional_special_tokens_ids", [token_id_to_test_setters]) + self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [token_to_test_setters]) + self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [token_id_to_test_setters]) diff --git a/docs/transformers/tests/models/camembert/__init__.py b/docs/transformers/tests/models/camembert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/camembert/test_modeling_tf_camembert.py b/docs/transformers/tests/models/camembert/test_modeling_tf_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..f9f8ba61d0e58bef3147b0935b7b365a8ad38963 --- /dev/null +++ b/docs/transformers/tests/models/camembert/test_modeling_tf_camembert.py @@ -0,0 +1,55 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +from __future__ import annotations + +import unittest + +from transformers import is_tf_available +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow + + +if is_tf_available(): + import numpy as np + import tensorflow as tf + + from transformers import TFCamembertModel + + +@require_tf +@require_sentencepiece +@require_tokenizers +class TFCamembertModelIntegrationTest(unittest.TestCase): + @slow + def test_output_embeds_base_model(self): + model = TFCamembertModel.from_pretrained("jplu/tf-camembert-base") + + input_ids = tf.convert_to_tensor( + [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]], + dtype=tf.int32, + ) # J'aime le camembert !" + + output = model(input_ids)["last_hidden_state"] + expected_shape = tf.TensorShape((1, 10, 768)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = tf.convert_to_tensor( + [[[-0.0254, 0.0235, 0.1027], [0.0606, -0.1811, -0.0418], [-0.1561, -0.1127, 0.2687]]], + dtype=tf.float32, + ) + # camembert = torch.hub.load('pytorch/fairseq', 'camembert.v0') + # camembert.eval() + # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach() + + self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4)) diff --git a/docs/transformers/tests/models/camembert/test_tokenization_camembert.py b/docs/transformers/tests/models/camembert/test_tokenization_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..33a49b33958ef8ccb43eeb6c27bf0d4eb096891f --- /dev/null +++ b/docs/transformers/tests/models/camembert/test_tokenization_camembert.py @@ -0,0 +1,220 @@ +# Copyright 2018 HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import tempfile +import unittest +from tempfile import TemporaryDirectory + +from transformers import AddedToken, CamembertTokenizer, CamembertTokenizerFast +from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow +from transformers.utils import is_torch_available + +from ...test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") +SAMPLE_BPE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe.model") + +FRAMEWORK = "pt" if is_torch_available() else "tf" + + +@require_sentencepiece +@require_tokenizers +class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "almanach/camembert-base" + tokenizer_class = CamembertTokenizer + rust_tokenizer_class = CamembertTokenizerFast + test_rust_tokenizer = True + test_sentencepiece = True + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # We have a SentencePiece fixture for testing + tokenizer = CamembertTokenizer(SAMPLE_VOCAB) + tokenizer.save_pretrained(cls.tmpdirname) + + @unittest.skip( + "Token maps are not equal because someone set the probability of ('<unk>NOTUSED', -100), so it's never encoded for fast" + ) + def test_special_tokens_map_equal(self): + return + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "<pad>" + token_id = 1 # 1 is the offset id, but in the spm vocab it's 3 + + self.assertEqual(self.get_tokenizer().convert_tokens_to_ids(token), token_id) + self.assertEqual(self.get_tokenizer().convert_ids_to_tokens(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "<s>NOTUSED") + self.assertEqual(vocab_keys[1], "<pad>") + self.assertEqual(vocab_keys[-1], "<mask>") + self.assertEqual(len(vocab_keys), 1_005) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 1_000) + + def test_rust_and_python_bpe_tokenizers(self): + tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB) + with TemporaryDirectory() as tmpdirname: + tokenizer.save_pretrained(tmpdirname) + rust_tokenizer = CamembertTokenizerFast.from_pretrained(tmpdirname) + + sequence = "I was born in 92000, and this is falsé." + + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + # tokens are not the same for `rust` than for `slow`. + # Because spm gives back raw token instead of `unk` in EncodeAsPieces + # tokens = tokenizer.tokenize(sequence) + tokens = tokenizer.convert_ids_to_tokens(ids) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "I was born in 92000, and this is falsé."
+ + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + @slow + def test_tokenizer_integration(self): + expected_encoding = {'input_ids': [[5, 54, 7196, 297, 30, 23, 776, 18, 11, 3215, 3705, 8252, 22, 3164, 1181, 2116, 29, 16, 813, 25, 791, 3314, 20, 3446, 38, 27575, 120, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [5, 468, 17, 11, 9088, 20, 1517, 8, 22804, 18818, 10, 38, 629, 607, 607, 142, 19, 7196, 867, 56, 10326, 24, 2267, 20, 416, 5072, 15612, 233, 734, 7, 2399, 27, 16, 3015, 1649, 7, 24, 20, 4338, 2399, 27, 13, 3400, 14, 13, 6189, 8, 930, 9, 6]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip + + # camembert is a french model. So we also use french texts. + sequences = [ + "Le transformeur est un modèle d'apprentissage profond introduit en 2017, " + "utilisé principalement dans le domaine du traitement automatique des langues (TAL).", + "À l'instar des réseaux de neurones récurrents (RNN), les transformeurs sont conçus " + "pour gérer des données séquentielles, telles que le langage naturel, pour des tâches " + "telles que la traduction et la synthèse de texte.", + ] + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="almanach/camembert-base", + revision="3a0641d9a1aeb7e848a74299e7e4c4bca216b4cf", + sequences=sequences, + ) + + # Overwritten because we have to use from slow (online pretrained is wrong, the tokenizer.json has a whole) + def test_added_tokens_serialization(self): + self.maxDiff = None + + # Utility to test the added vocab + def _test_added_vocab_and_eos(expected, tokenizer_class, expected_eos, temp_dir): + tokenizer = tokenizer_class.from_pretrained(temp_dir) + self.assertTrue(str(expected_eos) not in tokenizer.additional_special_tokens) + self.assertIn(new_eos, tokenizer.added_tokens_decoder.values()) + self.assertEqual(tokenizer.added_tokens_decoder[tokenizer.eos_token_id], new_eos) + self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items())) + return tokenizer + + new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True) + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + # Load a slow tokenizer from the hub, init with the new token for fast to also include it + tokenizer = self.get_tokenizer(pretrained_name, eos_token=new_eos) + EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder + with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"): + self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos) + self.assertIn(new_eos, list(tokenizer.added_tokens_decoder.values())) + + with tempfile.TemporaryDirectory() as tmp_dir_2: + tokenizer.save_pretrained(tmp_dir_2) + with self.subTest( + "Hub -> Slow 
-> Slow: Test saving this slow tokenizer and reloading it in the fast class" + ): + _test_added_vocab_and_eos( + EXPECTED_ADDED_TOKENS_DECODER, self.tokenizer_class, new_eos, tmp_dir_2 + ) + + if self.rust_tokenizer_class is not None: + with self.subTest( + "Hub -> Slow -> Fast: Test saving this slow tokenizer and reloading it in the fast class" + ): + tokenizer_fast = _test_added_vocab_and_eos( + EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_2 + ) + with tempfile.TemporaryDirectory() as tmp_dir_3: + tokenizer_fast.save_pretrained(tmp_dir_3) + with self.subTest( + "Hub -> Slow -> Fast -> Fast: Test saving this fast tokenizer and reloading it in the fast class" + ): + _test_added_vocab_and_eos( + EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_3 + ) + + with self.subTest( + "Hub -> Slow -> Fast -> Slow: Test saving this slow tokenizer and reloading it in the slow class" + ): + _test_added_vocab_and_eos( + EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_3 + ) + + with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"): + if self.rust_tokenizer_class is not None: + tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos, from_slow=True) + self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos) + self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values())) + # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright + with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"): + with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"): + self.assertTrue( + all( + item in tokenizer.added_tokens_decoder.items() + for item in EXPECTED_ADDED_TOKENS_DECODER.items() + ) + ) + + EXPECTED_ADDED_TOKENS_DECODER = tokenizer_fast.added_tokens_decoder + with tempfile.TemporaryDirectory() as tmp_dir_4: + tokenizer_fast.save_pretrained(tmp_dir_4) + with self.subTest("Hub -> Fast -> Fast: saving Fast1 locally and loading"): + _test_added_vocab_and_eos( + EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_4 + ) + + with self.subTest("Hub -> Fast -> Slow: saving Fast1 locally and loading"): + _test_added_vocab_and_eos( + EXPECTED_ADDED_TOKENS_DECODER, self.tokenizer_class, new_eos, tmp_dir_4 + ) diff --git a/docs/transformers/tests/models/canine/__init__.py b/docs/transformers/tests/models/canine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/canine/test_modeling_canine.py b/docs/transformers/tests/models/canine/test_modeling_canine.py new file mode 100644 index 0000000000000000000000000000000000000000..94becd11af6e91e5d956d5b813cb787cae4df26f --- /dev/null +++ b/docs/transformers/tests/models/canine/test_modeling_canine.py @@ -0,0 +1,571 @@ +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch CANINE model.""" + +import unittest + +from transformers import CanineConfig, is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, global_rng, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + CanineForMultipleChoice, + CanineForQuestionAnswering, + CanineForSequenceClassification, + CanineForTokenClassification, + CanineModel, + ) + + +class CanineModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + # let's use a vocab size that's way bigger than BERT's one + # NOTE: this is not a model parameter, just an input + vocab_size=100000, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + num_hash_buckets=16, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.num_hash_buckets = num_hash_buckets + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor(input_ids.shape, self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return CanineConfig( + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + 
max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + num_hash_buckets=self.num_hash_buckets, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = CanineModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = CanineForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = CanineForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = CanineForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = CanineForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class CanineModelTest(ModelTesterMixin, 
PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + CanineModel, + CanineForMultipleChoice, + CanineForQuestionAnswering, + CanineForSequenceClassification, + CanineForTokenClassification, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": CanineModel, + "question-answering": CanineForQuestionAnswering, + "text-classification": CanineForSequenceClassification, + "token-classification": CanineForTokenClassification, + "zero-shot": CanineForSequenceClassification, + } + if is_torch_available() + else {} + ) + + test_mismatched_shapes = False + test_resize_embeddings = False + test_pruning = False + + def setUp(self): + self.model_tester = CanineModelTester(self) + # we set has_text_modality to False as the config has no vocab_size attribute + self.config_tester = ConfigTester(self, config_class=CanineConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + # expected_num_layers equals num_hidden_layers of the deep encoder + 1, + 2 for the first shallow encoder, + 2 + # for the final shallow encoder + expected_num_layers = self.model_tester.num_hidden_layers + 1 + 2 + 2 + self.assertEqual(len(hidden_states), expected_num_layers) + + seq_length = self.model_tester.seq_length + for i in range(expected_num_layers): + if (i < 2) or ((expected_num_layers - i) < 3): + # the expected length of the hidden_states of the first and final shallow encoders + # is equal to the seq_length + self.assertListEqual( + list(hidden_states[i].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + else: + # the expected length of the hidden_states of the deep encoder need to be updated + # for CANINE since the seq length is downsampled + self.assertListEqual( + list(hidden_states[i].shape[-2:]), + [seq_length // config.downsampling_rate, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + 
check_hidden_states_output(inputs_dict, config, model_class) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + # we add + 2 due to the 2 shallow encoders + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers + 2) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + # we add + 2 due to the 2 shallow encoders + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers + 2) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers + 2) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def set_nan_tensor_to_zero(t): + t[t != t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (list, tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + torch.allclose( + set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 + ), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:" + f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has" + f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}." 
+ ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + print(model_class) + model = model_class(config) + model.to(torch_device) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence( + model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} + ) + + def test_headmasking(self): + if not self.test_head_masking: + self.skipTest(reason="test_head_masking is set to False") + + global_rng.seed(42) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + global_rng.seed() + + inputs_dict["output_attentions"] = True + config.output_hidden_states = True + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + # Prepare head_mask + # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) + head_mask = torch.ones( + self.model_tester.num_hidden_layers, + self.model_tester.num_attention_heads, + device=torch_device, + ) + head_mask[0, 0] = 0 + head_mask[-1, :-1] = 0 + head_mask.requires_grad_(requires_grad=True) + inputs = self._prepare_for_class(inputs_dict, model_class).copy() + inputs["head_mask"] = head_mask + + outputs = model(**inputs, return_dict=True) + + # Test that we can get a gradient back for importance score computation + output = sum(t.sum() for t in outputs[0]) + output = output.sum() + output.backward() + multihead_outputs = head_mask.grad + + self.assertIsNotNone(multihead_outputs) + self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) + + def check_attentions_validity(attentions): + # Remove Nan + for t in attentions: + self.assertLess( + torch.sum(torch.isnan(t)), t.numel() / 4 + ) # Check we don't have more than 25% nans (arbitrary) + attentions = [ + t.masked_fill(torch.isnan(t), 0.0) for t in attentions + ] # remove them (the 
test is less complete) + + self.assertAlmostEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[1][..., -1, :, :].flatten().sum().item(), 0.0) + self.assertAlmostEqual(attentions[-2][..., -2, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[-2][..., -1, :, :].flatten().sum().item(), 0.0) + + check_attentions_validity(outputs.attentions) + + @unittest.skip(reason="CANINE does not have a get_input_embeddings() method.") + def test_inputs_embeds(self): + # ViT does not use inputs_embeds + pass + + @unittest.skip(reason="Canine Tower does not use inputs_embeds") + def test_inputs_embeds_matches_input_ids(self): + pass + + @unittest.skip(reason="CANINE does not have a get_input_embeddings() method.") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "google/canine-s" + model = CanineModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class CanineModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head(self): + model = CanineModel.from_pretrained("google/canine-s") + # this one corresponds to the first example of the TydiQA dev set (in Swahili) + # fmt: off + input_ids = [57344, 57349, 85, 107, 117, 98, 119, 97, 32, 119, 97, 32, 82, 105, 106, 105, 108, 105, 32, 75, 97, 110, 116, 111, 114, 105, 32, 110, 105, 32, 107, 105, 97, 115, 105, 32, 103, 97, 110, 105, 63, 57345, 57350, 32, 82, 105, 106, 105, 108, 105, 32, 75, 97, 110, 116, 111, 114, 105, 32, 44, 32, 82, 105, 106, 105, 108, 105, 32, 75, 97, 110, 116, 97, 114, 117, 115, 105, 32, 97, 117, 32, 105, 110, 103, 46, 32, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 40, 112, 105, 97, 58, 32, 84, 111, 108, 105, 109, 97, 110, 32, 97, 117, 32, 82, 105, 103, 105, 108, 32, 75, 101, 110, 116, 97, 117, 114, 117, 115, 41, 32, 110, 105, 32, 110, 121, 111, 116, 97, 32, 105, 110, 97, 121, 111, 110, 103, 39, 97, 97, 32, 115, 97, 110, 97, 32, 107, 97, 116, 105, 107, 97, 32, 97, 110, 103, 97, 32, 121, 97, 32, 107, 117, 115, 105, 110, 105, 32, 107, 119, 101, 110, 121, 101, 32, 107, 117, 110, 100, 105, 110, 121, 111, 116, 97, 32, 121, 97, 32, 75, 97, 110, 116, 97, 114, 117, 115, 105, 32, 40, 112, 105, 97, 58, 32, 105, 110, 103, 46, 32, 67, 101, 110, 116, 97, 117, 114, 117, 115, 41, 46, 32, 78, 105, 32, 110, 121, 111, 116, 97, 32, 121, 97, 32, 107, 117, 110, 103, 97, 97, 32, 115, 97, 110, 97, 32, 121, 97, 32, 110, 110, 101, 32, 97, 110, 103, 97, 110, 105, 32, 108, 97, 107, 105, 110, 105, 32, 104, 97, 105, 111, 110, 101, 107, 97, 110, 105, 32, 107, 119, 101, 110, 121, 101, 32, 110, 117, 115, 117, 100, 117, 110, 105, 97, 32, 121, 97, 32, 107, 97, 115, 107, 97, 122, 105, 110, 105, 46, 32, 57351, 32, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 
110, 105, 32, 110, 121, 111, 116, 97, 32, 121, 97, 32, 112, 101, 107, 101, 101, 32, 107, 119, 97, 32, 115, 97, 98, 97, 98, 117, 32, 110, 105, 32, 110, 121, 111, 116, 97, 32, 121, 101, 116, 117, 32, 106, 105, 114, 97, 110, 105, 32, 107, 97, 116, 105, 107, 97, 32, 97, 110, 103, 97, 32, 105, 110, 97, 32, 117, 109, 98, 97, 108, 105, 32, 119, 97, 32, 109, 105, 97, 107, 97, 32, 121, 97, 32, 110, 117, 114, 117, 32, 52, 46, 50, 46, 32, 73, 110, 97, 111, 110, 101, 107, 97, 110, 97, 32, 97, 110, 103, 97, 110, 105, 32, 107, 97, 114, 105, 98, 117, 32, 110, 97, 32, 107, 117, 110, 100, 105, 110, 121, 111, 116, 97, 32, 121, 97, 32, 83, 97, 108, 105, 98, 117, 32, 40, 67, 114, 117, 120, 41, 46, 32, 57352, 32, 82, 105, 106, 105, 108, 105, 32, 75, 97, 110, 116, 97, 114, 117, 115, 105, 32, 40, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 41, 32, 105, 110, 97, 111, 110, 101, 107, 97, 110, 97, 32, 107, 97, 109, 97, 32, 110, 121, 111, 116, 97, 32, 109, 111, 106, 97, 32, 108, 97, 107, 105, 110, 105, 32, 107, 119, 97, 32, 100, 97, 114, 117, 98, 105, 110, 105, 32, 107, 117, 98, 119, 97, 32, 105, 110, 97, 111, 110, 101, 107, 97, 110, 97, 32, 107, 117, 119, 97, 32, 109, 102, 117, 109, 111, 32, 119, 97, 32, 110, 121, 111, 116, 97, 32, 116, 97, 116, 117, 32, 122, 105, 110, 97, 122, 111, 107, 97, 97, 32, 107, 97, 114, 105, 98, 117, 32, 110, 97, 32, 107, 117, 115, 104, 105, 107, 97, 109, 97, 110, 97, 32, 107, 97, 116, 105, 32, 121, 97, 111, 46, 32, 78, 121, 111, 116, 97, 32, 109, 97, 112, 97, 99, 104, 97, 32, 122, 97, 32, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 65, 32, 110, 97, 32, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 66, 32, 122, 105, 107, 111, 32, 109, 105, 97, 107, 97, 32, 121, 97, 32, 110, 117, 114, 117, 32, 52, 46, 51, 54, 32, 107, 117, 116, 111, 107, 97, 32, 107, 119, 101, 116, 117, 32, 110, 97, 32, 110, 121, 111, 116, 97, 32, 121, 97, 32, 116, 97, 116, 117, 32, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 67, 32, 97, 117, 32, 80, 114, 111, 120, 105, 109, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 105, 110, 97, 32, 117, 109, 98, 97, 108, 105, 32, 119, 97, 32, 109, 105, 97, 107, 97, 32, 121, 97, 32, 110, 117, 114, 117, 32, 52, 46, 50, 50, 46, 32, 57353, 32, 80, 114, 111, 120, 105, 109, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 40, 121, 97, 97, 110, 105, 32, 110, 121, 111, 116, 97, 32, 121, 97, 32, 75, 97, 110, 116, 97, 114, 117, 115, 105, 32, 105, 108, 105, 121, 111, 32, 107, 97, 114, 105, 98, 117, 32, 122, 97, 105, 100, 105, 32, 110, 97, 115, 105, 41, 32, 105, 109, 101, 103, 117, 110, 100, 117, 108, 105, 119, 97, 32, 107, 117, 119, 97, 32, 110, 97, 32, 115, 97, 121, 97, 114, 105, 32, 109, 111, 106, 97, 46, 32, 86, 105, 112, 105, 109, 111, 32, 118, 105, 110, 97, 118, 121, 111, 112, 97, 116, 105, 107, 97, 110, 97, 32, 104, 97, 100, 105, 32, 115, 97, 115, 97, 32, 122, 105, 110, 97, 111, 110, 121, 101, 115, 104, 97, 32, 117, 119, 101, 122, 101, 107, 97, 110, 111, 32, 109, 107, 117, 98, 119, 97, 32, 121, 97, 32, 107, 119, 97, 109, 98, 97, 32, 115, 97, 121, 97, 114, 105, 32, 104, 105, 105, 32, 110, 105, 32, 121, 97, 32, 109, 119, 97, 109, 98, 97, 32, 40, 107, 97, 109, 97, 32, 100, 117, 110, 105, 97, 32, 121, 101, 116, 117, 44, 32, 77, 105, 114, 105, 104, 105, 32, 97, 117, 32, 90, 117, 104, 117, 114, 97, 41, 32, 110, 97, 32, 105, 110, 97, 119, 101, 122, 97, 32, 107, 117, 119, 97, 32, 110, 97, 32, 97, 110, 103, 97, 104, 101, 119, 97, 44, 32, 116, 101, 110, 97, 32, 107, 97, 116, 105, 107, 97, 32, 117, 112, 
101, 111, 32, 119, 97, 32, 106, 111, 116, 111, 32, 117, 110, 97, 111, 114, 117, 104, 117, 115, 117, 32, 107, 117, 119, 101, 112, 111, 32, 107, 119, 97, 32, 117, 104, 97, 105, 46, 32, 91, 49, 93, 57345, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + attention_mask = [1 if x != 0 else 0 for x in input_ids] + token_type_ids = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + # fmt: on + input_ids = torch.tensor([input_ids]) + attention_mask = torch.tensor([attention_mask]) + token_type_ids = torch.tensor([token_type_ids]) + outputs = model(input_ids, attention_mask, token_type_ids) + + # verify sequence output + expected_shape = torch.Size((1, 2048, 768)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [ + [-0.161433131, 0.395568609, 0.0407391489], + [-0.108025983, 0.362060368, -0.544592619], + [-0.141537309, 0.180541009, 0.076907], + ] + ) + + torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-2, atol=1e-2) + + # verify pooled output + expected_shape = torch.Size((1, 768)) + self.assertEqual(outputs.pooler_output.shape, expected_shape) + + expected_slice = torch.tensor([-0.884311497, -0.529064834, 0.723164916]) + + torch.testing.assert_close(outputs.pooler_output[0, :3], expected_slice, rtol=1e-2, atol=1e-2) diff --git a/docs/transformers/tests/models/canine/test_tokenization_canine.py b/docs/transformers/tests/models/canine/test_tokenization_canine.py new file mode 100644 index 0000000000000000000000000000000000000000..6bcb54e33f0d4bed169874a36d906847b63759cb --- /dev/null +++ b/docs/transformers/tests/models/canine/test_tokenization_canine.py @@ -0,0 +1,339 @@ +# Copyright 2021 Google AI and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil +import tempfile +import unittest +from functools import lru_cache + +from transformers import BatchEncoding, CanineTokenizer +from transformers.testing_utils import require_tokenizers, require_torch +from transformers.tokenization_utils import AddedToken +from transformers.utils import cached_property + +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible + + +class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "nielsr/canine-s" + tokenizer_class = CanineTokenizer + test_rust_tokenizer = False + + @classmethod + def setUpClass(cls): + super().setUpClass() + tokenizer = CanineTokenizer() + tokenizer.save_pretrained(cls.tmpdirname) + + @cached_property + def canine_tokenizer(self): + return CanineTokenizer.from_pretrained("google/canine-s") + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs) -> CanineTokenizer: + pretrained_name = pretrained_name or cls.tmpdirname + tokenizer = cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer._unicode_vocab_size = 1024 + return tokenizer + + @require_torch + def test_prepare_batch_integration(self): + tokenizer = self.canine_tokenizer + src_text = ["Life is like a box of chocolates.", "You never know what you're gonna get."] + expected_src_tokens = [57344, 76, 105, 102, 101, 32, 105, 115, 32, 108, 105, 107, 101, 32, 97, 32, 98, 111, 120, 32, 111, 102, 32, 99, 104, 111, 99, 111, 108, 97, 116, 101, 115, 46, 57345, 0, 0, 0, 0] # fmt: skip + batch = tokenizer(src_text, padding=True, return_tensors="pt") + self.assertIsInstance(batch, BatchEncoding) + + result = list(batch.input_ids.numpy()[0]) + + self.assertListEqual(expected_src_tokens, result) + + self.assertEqual((2, 39), batch.input_ids.shape) + self.assertEqual((2, 39), batch.attention_mask.shape) + + @require_torch + def test_encoding_keys(self): + tokenizer = self.canine_tokenizer + src_text = ["Once there was a man.", "He wrote a test in HuggingFace Transformers."] + batch = tokenizer(src_text, padding=True, return_tensors="pt") + # check if input_ids, attention_mask and token_type_ids are returned + self.assertIn("input_ids", batch) + self.assertIn("attention_mask", batch) + self.assertIn("token_type_ids", batch) + + @require_torch + def test_max_length_integration(self): + tokenizer = self.canine_tokenizer + tgt_text = [ + "What's the weater?", + "It's about 25 degrees.", + ] + targets = tokenizer( + text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors="pt" + ) + self.assertEqual(32, targets["input_ids"].shape[1]) + + # cannot use default save_and_load_tokenizer test method because tokenizer has no vocab + def test_save_and_load_tokenizer(self): + # safety check on max_len default value so we are sure the test works + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + self.assertNotEqual(tokenizer.model_max_length, 42) + + # Now let's start the test + tokenizers = 
self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + tmpdirname = tempfile.mkdtemp() + + sample_text = " He is very happy, UNwant\u00e9d,running" + before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) + self.assertListEqual(before_tokens, after_tokens) + + shutil.rmtree(tmpdirname) + + tokenizers = self.get_tokenizers(model_max_length=42) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + tmpdirname = tempfile.mkdtemp() + + sample_text = " He is very happy, UNwant\u00e9d,running" + + additional_special_tokens = tokenizer.additional_special_tokens + + # We can add a new special token for Canine as follows: + new_additional_special_token = chr(0xE007) + additional_special_tokens.append(new_additional_special_token) + tokenizer.add_special_tokens( + {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False + ) + before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) + self.assertListEqual(before_tokens, after_tokens) + self.assertIn(new_additional_special_token, after_tokenizer.additional_special_tokens) + self.assertEqual(after_tokenizer.model_max_length, 42) + + tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43) + self.assertEqual(tokenizer.model_max_length, 43) + + shutil.rmtree(tmpdirname) + + def test_add_special_tokens(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + input_text, ids = self.get_clean_sequence(tokenizer) + + # a special token for Canine can be defined as follows: + SPECIAL_TOKEN = 0xE005 + special_token = chr(SPECIAL_TOKEN) + + tokenizer.add_special_tokens({"cls_token": special_token}) + encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False) + self.assertEqual(len(encoded_special_token), 1) + + text = tokenizer.decode(ids + encoded_special_token, clean_up_tokenization_spaces=False) + encoded = tokenizer.encode(text, add_special_tokens=False) + + input_encoded = tokenizer.encode(input_text, add_special_tokens=False) + special_token_id = tokenizer.encode(special_token, add_special_tokens=False) + self.assertEqual(encoded, input_encoded + special_token_id) + + decoded = tokenizer.decode(encoded, skip_special_tokens=True) + self.assertTrue(special_token not in decoded) + + def test_tokenize_special_tokens(self): + tokenizers = self.get_tokenizers(do_lower_case=True) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + SPECIAL_TOKEN_1 = chr(0xE005) + SPECIAL_TOKEN_2 = chr(0xE006) + tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True) + tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_TOKEN_2]}) + + token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1) + token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2) + + self.assertEqual(len(token_1), 1) + self.assertEqual(len(token_2), 1) + 
self.assertEqual(token_1[0], SPECIAL_TOKEN_1) + self.assertEqual(token_2[0], SPECIAL_TOKEN_2) + + @require_tokenizers + def test_added_token_serializable(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # a special token for Canine can be defined as follows: + NEW_TOKEN = 0xE006 + new_token = chr(NEW_TOKEN) + + new_token = AddedToken(new_token, lstrip=True) + tokenizer.add_special_tokens({"additional_special_tokens": [new_token]}) + + with tempfile.TemporaryDirectory() as tmp_dir_name: + tokenizer.save_pretrained(tmp_dir_name) + tokenizer.from_pretrained(tmp_dir_name) + + def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self): + tokenizer_list = [] + if self.test_slow_tokenizer: + tokenizer_list.append((self.tokenizer_class, self.get_tokenizer())) + + if self.test_rust_tokenizer: + tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer())) + + for tokenizer_class, tokenizer_utils in tokenizer_list: + with tempfile.TemporaryDirectory() as tmp_dir: + tokenizer_utils.save_pretrained(tmp_dir) + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file: + special_tokens_map = json.load(json_file) + + with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file: + tokenizer_config = json.load(json_file) + + # a special token for Canine can be defined as follows: + NEW_TOKEN = 0xE006 + new_token_1 = chr(NEW_TOKEN) + + special_tokens_map["additional_special_tokens"] = [new_token_1] + tokenizer_config["additional_special_tokens"] = [new_token_1] + + with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile: + json.dump(special_tokens_map, outfile) + with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile: + json.dump(tokenizer_config, outfile) + + # the following checks allow us to verify that our test works as expected, i.e. 
that the tokenizer takes + # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and + # "special_tokens_map.json" files + tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir, extra_ids=0) + self.assertIn(new_token_1, tokenizer_without_change_in_init.additional_special_tokens) + # self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab + self.assertEqual( + [new_token_1], + tokenizer_without_change_in_init.convert_ids_to_tokens( + tokenizer_without_change_in_init.convert_tokens_to_ids([new_token_1]) + ), + ) + + NEW_TOKEN = 0xE007 + new_token_2 = chr(NEW_TOKEN) + # Now we test that we can change the value of additional_special_tokens in the from_pretrained + new_added_tokens = [AddedToken(new_token_2, lstrip=True)] + tokenizer = tokenizer_class.from_pretrained( + tmp_dir, additional_special_tokens=new_added_tokens, extra_ids=0 + ) + + self.assertIn(new_token_2, tokenizer.additional_special_tokens) + # self.assertIn(new_token_2,tokenizer.get_vocab()) # ByT5Tokenization no vocab + self.assertEqual( + [new_token_2], tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids([new_token_2])) + ) + + @require_tokenizers + def test_encode_decode_with_spaces(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + input = "hello world" + if self.space_between_special_tokens: + output = "[CLS] hello world [SEP]" + else: + output = input + encoded = tokenizer.encode(input, add_special_tokens=False) + decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) + self.assertIn(decoded, [output, output.lower()]) + + # cannot use default `test_tokenizers_common_ids_setters` method because tokenizer has no vocab + def test_tokenizers_common_ids_setters(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + attributes_list = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + ] + + token_to_test_setters = "a" + token_id_to_test_setters = ord(token_to_test_setters) + + for attr in attributes_list: + setattr(tokenizer, attr + "_id", None) + self.assertEqual(getattr(tokenizer, attr), None) + self.assertEqual(getattr(tokenizer, attr + "_id"), None) + + setattr(tokenizer, attr + "_id", token_id_to_test_setters) + self.assertEqual(getattr(tokenizer, attr), token_to_test_setters) + self.assertEqual(getattr(tokenizer, attr + "_id"), token_id_to_test_setters) + + setattr(tokenizer, "additional_special_tokens_ids", []) + self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), []) + self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), []) + + additional_special_token_id = 0xE006 + additional_special_token = chr(additional_special_token_id) + setattr(tokenizer, "additional_special_tokens_ids", [additional_special_token_id]) + self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [additional_special_token]) + self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [additional_special_token_id]) + + @unittest.skip(reason="tokenizer has a fixed vocab_size (namely all possible unicode code points)") + def test_add_tokens_tokenizer(self): + pass + + # CanineTokenizer does not support do_lower_case = True, as each character has its own Unicode code point + # 
("b" and "B" for example have different Unicode code points) + @unittest.skip(reason="CanineTokenizer does not support do_lower_case = True") + def test_added_tokens_do_lower_case(self): + pass + + @unittest.skip(reason="CanineModel does not support the get_input_embeddings nor the get_vocab method") + def test_np_encode_plus_sent_to_model(self): + pass + + @unittest.skip(reason="CanineModel does not support the get_input_embeddings nor the get_vocab method") + def test_torch_encode_plus_sent_to_model(self): + pass + + @unittest.skip(reason="CanineTokenizer does not have vocabulary") + def test_get_vocab(self): + pass + + @unittest.skip(reason="inputs cannot be pretokenized since ids depend on whole input string") + def test_pretokenized_inputs(self): + pass + + @unittest.skip(reason="CanineTokenizer does not have vocabulary") + def test_conversion_reversible(self): + pass diff --git a/docs/transformers/tests/models/chameleon/__init__.py b/docs/transformers/tests/models/chameleon/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/chameleon/test_image_processing_chameleon.py b/docs/transformers/tests/models/chameleon/test_image_processing_chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..fcbd7b46d55522db299eac79ddecbabd22c424e5 --- /dev/null +++ b/docs/transformers/tests/models/chameleon/test_image_processing_chameleon.py @@ -0,0 +1,204 @@ +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import ChameleonImageProcessor + + +class ChameleonImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=200, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=[1.0, 1.0, 1.0], + image_std=[1.0, 1.0, 1.0], + do_convert_rgb=True, + ): + size = size if size is not None else {"shortest_edge": 18} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape + def expected_output_image_shape(self, images): + return self.num_channels, self.crop_size["height"], self.crop_size["width"] + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class ChameleonImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = ChameleonImageProcessor if is_vision_available() else None + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->Chameleon + def setUp(self): + super().setUp() + self.image_processor_tester = ChameleonImageProcessingTester(self) + + @property + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + 
self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 18}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) + self.assertEqual(image_processor.size, {"shortest_edge": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) + + def test_call_pil(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = (1, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + def test_call_numpy(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = (1, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + def test_call_pytorch(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True) + + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = (1, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + def test_nested_input(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) + + # Test batched as a list of images + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched as 
a nested list of images, where each sublist is one batch + image_inputs_nested = [image_inputs[:3], image_inputs[3:]] + encoded_images_nested = image_processing(image_inputs_nested, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 3, 18, 18) + self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape) + + # Image processor should return same pixel values, independently of input format + self.assertTrue((encoded_images_nested == encoded_images).all()) diff --git a/docs/transformers/tests/models/chameleon/test_modeling_chameleon.py b/docs/transformers/tests/models/chameleon/test_modeling_chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..c393c9cf88d69891ac6004ee4beb0eba68897e9e --- /dev/null +++ b/docs/transformers/tests/models/chameleon/test_modeling_chameleon.py @@ -0,0 +1,481 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch chameleon model.""" + +import copy +import unittest + +import requests +from parameterized import parameterized + +from transformers import ChameleonConfig, is_torch_available, is_vision_available, set_seed +from transformers.testing_utils import ( + require_bitsandbytes, + require_read_token, + require_torch, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_vision_available(): + from PIL import Image + +if is_torch_available(): + import torch + + from transformers import ( + ChameleonForConditionalGeneration, + ChameleonModel, + ChameleonProcessor, + ) + + +class ChameleonModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=35, + is_training=False, + use_input_mask=True, + use_labels=True, + vocab_size=99, + image_token_id=4, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=2, + num_key_value_heads=2, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + pad_token_id=0, + vq_num_embeds=5, + vq_embed_dim=5, + vq_channel_multiplier=[1, 4], + vq_img_token_start_id=10, # has to be less than vocab size when added with vq_num_embeds + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.image_token_id = image_token_id + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.intermediate_size = intermediate_size + 
self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + self.vq_num_embeds = vq_num_embeds + self.vq_embed_dim = vq_embed_dim + self.vq_channel_multiplier = vq_channel_multiplier + self.vq_img_token_start_id = vq_img_token_start_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + # create dummy vocab map for image2bpe mapping if it needs remapping + # we assume that vocab size is big enough to account for image tokens somewhere in the beginning + # same way as in real ckpt, when img tokens are in first half of embeds + # we will need "vq_num_embeds" amount of tokens + + vocab_map = {i: chr(i) for i in range(self.vocab_size)} + vocab_map[self.image_token_id] = "" + start = self.vq_img_token_start_id + end = self.vq_img_token_start_id + self.vq_num_embeds + for i in range(start, end): + image_token_infix = "".join(chr(ord("A") + int(c)) for c in str(i)) + # dummy str for each image token, anything starting with IMGIMG + vocab_map[i] = f"IMGIMG{image_token_infix}Z" + + return ChameleonConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + num_key_value_heads=self.num_key_value_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + vocabulary_map={v: k for k, v in vocab_map.items()}, + vq_config=self.get_vq_config(), + ) + + def get_vq_config(self): + return { + "embed_dim": self.vq_embed_dim, + "num_embeddings": self.vq_num_embeds, + "latent_channels": self.vq_embed_dim, + "in_channels": 3, + "base_channels": 32, # we have a GroupNorm of 32 groups, so can't do less + "channel_multiplier": self.vq_channel_multiplier, + } + + def create_and_check_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = ChameleonModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() 
+ ( + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class ChameleonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (ChameleonModel, ChameleonForConditionalGeneration) if is_torch_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": ChameleonModel, + "text-generation": ChameleonForConditionalGeneration, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + fx_compatible = False + + def setUp(self): + self.model_tester = ChameleonModelTester(self) + self.config_tester = ConfigTester(self, config_class=ChameleonConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @parameterized.expand([("linear",), ("dynamic",)]) + def test_model_rope_scaling(self, scaling_type): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + short_input = ids_tensor([1, 10], config.vocab_size) + long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + original_model = ChameleonModel(config) + original_model.to(torch_device) + original_model.eval() + original_short_output = original_model(short_input).last_hidden_state + original_long_output = original_model(long_input).last_hidden_state + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + config.rope_scaling = {"type": scaling_type, "factor": 10.0} + scaled_model = ChameleonModel(config) + scaled_model.to(torch_device) + scaled_model.eval() + scaled_short_output = scaled_model(short_input).last_hidden_state + scaled_long_output = scaled_model(long_input).last_hidden_state + + # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original + # maximum sequence length, so the outputs for the short input should match. 
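+ # Linear scaling, in contrast, rescales all positions by the factor, so even the short-input outputs are expected to differ.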
+ if scaling_type == "dynamic": + torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) + else: + self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) + + # The output should be different for long inputs + self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) + + @unittest.skip("Chameleon forces some token ids to be -inf!") + def test_batching_equivalence(self): + pass + + @unittest.skip("Chameleon VQ model cannot be squishes more due to hardcoded layer params in model code") + def test_model_is_small(self): + pass + + +class ChameleonVision2SeqModelTester(ChameleonModelTester): + def __init__(self, parent, image_size=10, **kwargs): + super().__init__(parent, **kwargs) + self.image_size = image_size + self.image_seq_length = 25 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids[input_ids == self.image_token_id] = self.pad_token_id + input_ids[:, : self.image_seq_length] = self.image_token_id + attention_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) + pixel_values = floats_tensor([self.batch_size, 3, self.image_size, self.image_size]) + + config = self.get_config() + + return config, input_ids, attention_mask, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask, "pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class ChameleonVision2SeqModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (ChameleonModel, ChameleonForConditionalGeneration) if is_torch_available() else () + pipeline_model_mapping = ( + { + "image-text-to-text": ChameleonForConditionalGeneration, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + fx_compatible = False + + def setUp(self): + self.model_tester = ChameleonVision2SeqModelTester(self) + self.config_tester = ConfigTester(self, config_class=ChameleonConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip("Chameleon forces some token ids to be -inf!") + def test_batching_equivalence(self): + pass + + @unittest.skip("Chameleon cannot do offload because it uses `self.linear.weight` in forward") + def test_cpu_offload(self): + pass + + @unittest.skip("Chameleon cannot do offload because it uses `self.linear.weight` in forward") + def test_disk_offload_bin(self): + pass + + @unittest.skip("Chameleon cannot do offload because it uses `self.linear.weight` in forward") + def test_disk_offload_safetensors(self): + pass + + @unittest.skip("Chameleon VQ model cannot be squishes more due to hardcoded layer params in model code") + def test_model_is_small(self): + pass + + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. 
+ """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + curr_input_dict = copy.deepcopy(input_dict) # the below tests modify dict in-place + _ = model(**curr_input_dict) # successful forward with no modifications + + # remove one image but leave the image token in text + curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...] + with self.assertRaises(ValueError): + _ = model(**curr_input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = curr_input_dict["input_ids"][:1] + pixel_values = curr_input_dict["pixel_values"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + # while some other models require pixel_values to be present + def test_inputs_embeds_matches_input_ids(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + inputs_embeds = model.get_input_embeddings()(input_ids) + + with torch.no_grad(): + out_ids = model(input_ids=input_ids, **inputs)[0] + out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] + torch.testing.assert_close(out_embeds, out_ids) + + +@require_torch +class ChameleonIntegrationTest(unittest.TestCase): + @slow + @require_bitsandbytes + @require_read_token + def test_model_7b(self): + model = ChameleonForConditionalGeneration.from_pretrained( + "facebook/chameleon-7b", load_in_4bit=True, device_map="auto" + ) + processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") + + image = Image.open( + requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw + ) + prompt = "Describe what do you see here and tell me about the history behind it?" + + inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, torch.float16) + + # greedy generation outputs + EXPECTED_TEXT_COMPLETION = ['Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Alpha Centauri. 
The star map is a representation of the night sky, showing the positions of stars in'] # fmt: skip + generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False) + text = processor.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, text) + + @slow + @require_bitsandbytes + @require_read_token + def test_model_7b_batched(self): + model = ChameleonForConditionalGeneration.from_pretrained( + "facebook/chameleon-7b", load_in_4bit=True, device_map="auto" + ) + processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") + + image = Image.open( + requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw + ) + image_2 = Image.open( + requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw + ) + prompts = [ + "Describe what do you see here and tell me about the history behind it?", + "What constellation is this image showing?", + ] + + inputs = processor(images=[image, image_2], text=prompts, padding=True, return_tensors="pt").to( + model.device, torch.float16 + ) + + # greedy generation outputs + EXPECTED_TEXT_COMPLETION = [ + 'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Alpha Centauri. The star map is a representation of the night sky, showing the positions of stars in', + 'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.' + ] # fmt: skip + generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False) + text = processor.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, text) + + @slow + @require_bitsandbytes + @require_read_token + def test_model_7b_multi_image(self): + model = ChameleonForConditionalGeneration.from_pretrained( + "facebook/chameleon-7b", load_in_4bit=True, device_map="auto" + ) + processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") + + image = Image.open( + requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw + ) + image_2 = Image.open( + requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw + ) + prompt = "What do these two images have in common?" + + inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.float16) + + # greedy generation outputs + EXPECTED_TEXT_COMPLETION = ['What do these two images have in common?The two images show a connection between the night sky and the internet. The first image shows a starry night sky, with the stars arranged in a pattern that resembles the structure of the internet. 
The'] # fmt: skip + generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False) + text = processor.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, text) diff --git a/docs/transformers/tests/models/chameleon/test_processor_chameleon.py b/docs/transformers/tests/models/chameleon/test_processor_chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..d11321c9a8701652b80c9e332f7957adbfaba81d --- /dev/null +++ b/docs/transformers/tests/models/chameleon/test_processor_chameleon.py @@ -0,0 +1,76 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch chameleon model.""" + +import tempfile +import unittest + +from transformers import ChameleonProcessor, LlamaTokenizer +from transformers.testing_utils import get_tests_dir +from transformers.utils import is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import ChameleonImageProcessor + + +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") + + +class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = ChameleonProcessor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + image_processor = ChameleonImageProcessor() + tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB) + tokenizer.pad_token_id = 0 + tokenizer.sep_token_id = 1 + tokenizer.add_special_tokens({"additional_special_tokens": [""]}) + processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer, image_seq_length=2) + processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.image_token + + def test_special_mm_token_truncation(self): + """Tests that special vision tokens do not get truncated when `truncation=True` is set.""" + + processor = self.get_processor() + + input_str = self.prepare_text_inputs(batch_size=2, modality="image") + image_input = self.prepare_image_inputs(batch_size=2) + + _ = processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=None, + padding=True, + ) + + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=True, + padding=True, + max_length=20, + ) + + @staticmethod + def prepare_processor_dict(): + return {"image_seq_length": 2} # fmt: skip diff --git a/docs/transformers/tests/models/chinese_clip/__init__.py b/docs/transformers/tests/models/chinese_clip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/chinese_clip/test_image_processing_chinese_clip.py b/docs/transformers/tests/models/chinese_clip/test_image_processing_chinese_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..7acae860b08aa64d4ffae69989ea20780efcdfe1 --- /dev/null +++ 
b/docs/transformers/tests/models/chinese_clip/test_image_processing_chinese_clip.py @@ -0,0 +1,175 @@ +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from transformers import ChineseCLIPImageProcessor + + if is_torchvision_available(): + from transformers import ChineseCLIPImageProcessorFast + + +class ChineseCLIPImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + do_convert_rgb=True, + ): + size = size if size is not None else {"height": 224, "width": 224} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + def expected_output_image_shape(self, images): + return 3, self.crop_size["height"], self.crop_size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class ChineseCLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None + fast_image_processing_class = ChineseCLIPImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = ChineseCLIPImageProcessingTester(self, do_center_crop=True) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in 
self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 224, "width": 224}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) + self.assertEqual(image_processor.size, {"shortest_edge": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) + + @unittest.skip( + reason="ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" + ) # FIXME Amy + def test_call_numpy_4_channels(self): + pass + + +@require_torch +@require_vision +class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None + fast_image_processing_class = ChineseCLIPImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = ChineseCLIPImageProcessingTester(self, num_channels=4, do_center_crop=True) + self.expected_encoded_image_num_channels = 3 + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + @unittest.skip(reason="ChineseCLIPImageProcessor does not support 4 channels yet") # FIXME Amy + def test_call_numpy(self): + return super().test_call_numpy() + + @unittest.skip(reason="ChineseCLIPImageProcessor does not support 4 channels yet") # FIXME Amy + def test_call_pytorch(self): + return super().test_call_torch() + + @unittest.skip( + reason="ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" + ) # FIXME Amy + def test_call_numpy_4_channels(self): + pass diff --git a/docs/transformers/tests/models/chinese_clip/test_modeling_chinese_clip.py b/docs/transformers/tests/models/chinese_clip/test_modeling_chinese_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..520ff2af3dd92527d4c7ab276a4a89c042dd26f7 --- /dev/null +++ b/docs/transformers/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -0,0 +1,762 @@ +# Copyright 2022 The HuggingFace 
Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Chinese-CLIP model.""" + +import inspect +import os +import tempfile +import unittest + +import numpy as np +import requests + +from transformers import ChineseCLIPConfig, ChineseCLIPTextConfig, ChineseCLIPVisionConfig +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, + random_attention_mask, +) +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + ChineseCLIPModel, + ChineseCLIPTextModel, + ChineseCLIPVisionModel, + ) + + +if is_vision_available(): + from PIL import Image + + from transformers import ChineseCLIPProcessor + + +class ChineseCLIPTextModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = 
ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + """ + Returns a tiny configuration by default. + """ + return ChineseCLIPTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ChineseCLIPTextModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = ChineseCLIPTextModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +class ChineseCLIPVisionModelTester: + def __init__( + self, + parent, 
+ batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return ChineseCLIPVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = ChineseCLIPVisionModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class ChineseCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (ChineseCLIPTextModel,) if is_torch_available() else () + fx_compatible = False + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["next_sentence_label"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = ChineseCLIPTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=ChineseCLIPTextConfig, hidden_size=37) + + def test_config(self): 
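+ # run_common_tests() exercises the shared ConfigTester checks on ChineseCLIPTextConfig (config serialization and save/re-load round-trips).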
+ self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + @slow + def test_model_from_pretrained(self): + model_name = "OFA-Sys/chinese-clip-vit-base-patch16" + model = ChineseCLIPTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + +@require_torch +class ChineseCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as CHINESE_CLIP does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
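+ The vision tower only consumes pixel_values, so pruning, head-masking and embedding-resize tests are turned off below and the config tester runs with has_text_modality=False.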
+ """ + + all_model_classes = (ChineseCLIPVisionModel,) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = ChineseCLIPVisionModelTester(self) + self.config_tester = ConfigTester( + self, config_class=ChineseCLIPVisionConfig, has_text_modality=False, hidden_size=37 + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="CHINESE_CLIP does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "OFA-Sys/chinese-clip-vit-base-patch16" + model = ChineseCLIPVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class ChineseCLIPModelTester: + def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): + if text_kwargs is None: + text_kwargs = {} + if vision_kwargs is None: + vision_kwargs = {} + + self.parent = parent + self.text_model_tester = ChineseCLIPTextModelTester(parent, **text_kwargs) + self.vision_model_tester = ChineseCLIPVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test + self.is_training = is_training + + def prepare_config_and_inputs(self): + ( + config, + input_ids, + token_type_ids, + attention_mask, + _, + __, + ___, + ) = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, token_type_ids, attention_mask, pixel_values + + def get_config(self): + return ChineseCLIPConfig.from_text_vision_configs( + self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + ) + + def create_and_check_model(self, config, input_ids, token_type_ids, attention_mask, pixel_values): + model = 
ChineseCLIPModel(config).to(torch_device).eval() + with torch.no_grad(): + result = model(input_ids, pixel_values, attention_mask, token_type_ids) + self.parent.assertEqual( + result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, token_type_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "return_loss": True, + } + return config, inputs_dict + + +@require_torch +class ChineseCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (ChineseCLIPModel,) if is_torch_available() else () + pipeline_model_mapping = {"feature-extraction": ChineseCLIPModel} if is_torch_available() else {} + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + + def setUp(self): + text_kwargs = {"use_labels": False, "batch_size": 12} + vision_kwargs = {"batch_size": 12} + self.model_tester = ChineseCLIPModelTester(self, text_kwargs, vision_kwargs) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="Inputs_embeds is tested in individual model tests") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="ChineseCLIPModel does not have input/output embeddings") + def test_model_get_set_embeddings(self): + pass + + # override as the `logit_scale` parameter initialization is different for CHINESE_CLIP + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for sub_config_key in ("vision_config", "text_config"): + sub_config = getattr(configs_no_init, sub_config_key, {}) + setattr(configs_no_init, sub_config_key, _config_zero_init(sub_config)) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + # check if `logit_scale` is initialized as per the original implementation + if name == "logit_scale": + self.assertAlmostEqual( + param.data.item(), + np.log(1 / 0.07), + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + self.skipTest(reason="test_torchscript is set to False") + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + configs_no_init.return_dict = False + for model_class in self.all_model_classes: + model = 
model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + try: + input_ids = inputs_dict["input_ids"] + pixel_values = inputs_dict["pixel_values"] # CHINESE_CLIP needs pixel_values + traced_model = torch.jit.trace(model, (input_ids, pixel_values)) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + non_persistent_buffers = {} + for key in loaded_model_state_dict.keys(): + if key not in model_state_dict.keys(): + non_persistent_buffers[key] = loaded_model_state_dict[key] + + loaded_model_state_dict = { + key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers + } + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + model_buffers = list(model.buffers()) + for non_persistent_buffer in non_persistent_buffers.values(): + found_buffer = False + for i, model_buffer in enumerate(model_buffers): + if torch.equal(non_persistent_buffer, model_buffer): + found_buffer = True + break + + self.assertTrue(found_buffer) + model_buffers.pop(i) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + @slow + def test_model_from_pretrained(self): + model_name = "OFA-Sys/chinese-clip-vit-base-patch16" + model = ChineseCLIPModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of Pikachu +def prepare_img(): + url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@require_vision +@require_torch +class ChineseCLIPModelIntegrationTest(unittest.TestCase): + @slow + def test_inference(self): + model_name = "OFA-Sys/chinese-clip-vit-base-patch16" + model = ChineseCLIPModel.from_pretrained(model_name).to(torch_device) + processor = ChineseCLIPProcessor.from_pretrained(model_name) + + image = prepare_img() + inputs = processor( + text=["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], images=image, padding=True, return_tensors="pt" + ).to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + self.assertEqual( + outputs.logits_per_image.shape, + torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + ) + self.assertEqual( + outputs.logits_per_text.shape, + torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), + ) + + probs = outputs.logits_per_image.softmax(dim=1) + expected_probs = torch.tensor([[1.2686e-03, 5.4499e-02, 6.7968e-04, 9.4355e-01]], device=torch_device) + + torch.testing.assert_close(probs, expected_probs, rtol=5e-3, atol=5e-3) + + @slow + def test_inference_interpolate_pos_encoding(self): + # ViT models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. 
The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. + model_name = "OFA-Sys/chinese-clip-vit-base-patch16" + model = ChineseCLIPModel.from_pretrained(model_name).to(torch_device) + + image_processor = ChineseCLIPProcessor.from_pretrained( + model_name, size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180} + ) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) + + # interpolate_pos_encoding=False should raise a ValueError + with self.assertRaises(ValueError, msg="doesn't match model"): + with torch.no_grad(): + model(**inputs, interpolate_pos_encoding=False) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 122, 768)) + + self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[-0.3990, 0.2983, -0.1239], [-0.1452, -0.2759, 0.0403], [-0.3149, -0.4763, 0.8555]] + ).to(torch_device) + + torch.testing.assert_close( + outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4 + ) diff --git a/docs/transformers/tests/models/chinese_clip/test_processor_chinese_clip.py b/docs/transformers/tests/models/chinese_clip/test_processor_chinese_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..3c2a2247c5a2194f3d4a914ebdd3426f79eab70d --- /dev/null +++ b/docs/transformers/tests/models/chinese_clip/test_processor_chinese_clip.py @@ -0,0 +1,217 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
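+# The tests below build a tiny Chinese BERT vocab and a minimal image-processor config on disk, then check that
+# ChineseCLIPProcessor ties the tokenizer and image processor together: save/load round-trips, encodings that match
+# the underlying tokenizer and image processor, batch_decode pass-through, and the exposed model_input_names.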
+ +import json +import os +import shutil +import tempfile +import unittest + +import pytest + +from transformers import BertTokenizer, BertTokenizerFast +from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES +from transformers.testing_utils import require_vision +from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import ChineseCLIPImageProcessor, ChineseCLIPProcessor + + +@require_vision +class ChineseCLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = ChineseCLIPProcessor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "的", + "价", + "格", + "是", + "15", + "便", + "alex", + "##andra", + ",", + "。", + "-", + "t", + "shirt", + ] + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + image_processor_map = { + "do_resize": True, + "size": {"height": 224, "width": 224}, + "do_center_crop": True, + "crop_size": {"height": 18, "width": 18}, + "do_normalize": True, + "image_mean": [0.48145466, 0.4578275, 0.40821073], + "image_std": [0.26862954, 0.26130258, 0.27577711], + "do_convert_rgb": True, + } + cls.image_processor_file = os.path.join(cls.tmpdirname, FEATURE_EXTRACTOR_NAME) + with open(cls.image_processor_file, "w", encoding="utf-8") as fp: + json.dump(image_processor_map, fp) + + tokenizer = cls.get_tokenizer() + image_processor = cls.get_image_processor() + processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor.save_pretrained(cls.tmpdirname) + + @classmethod + def get_tokenizer(cls, **kwargs): + return BertTokenizer.from_pretrained(cls.tmpdirname, **kwargs) + + @classmethod + def get_rust_tokenizer(cls, **kwargs): + return BertTokenizerFast.from_pretrained(cls.tmpdirname, **kwargs) + + @classmethod + def get_image_processor(cls, **kwargs): + return ChineseCLIPImageProcessor.from_pretrained(cls.tmpdirname, **kwargs) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname, ignore_errors=True) + + def test_save_load_pretrained_default(self): + tokenizer_slow = self.get_tokenizer() + tokenizer_fast = self.get_rust_tokenizer() + image_processor = self.get_image_processor() + + with tempfile.TemporaryDirectory() as tmpdir: + processor_slow = ChineseCLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) + processor_slow.save_pretrained(tmpdir) + processor_slow = ChineseCLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False) + + processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast.save_pretrained(tmpdir) + processor_fast = ChineseCLIPProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) + self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) + self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) + self.assertIsInstance(processor_slow.tokenizer, BertTokenizer) + self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) + + self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) + 
self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertIsInstance(processor_slow.image_processor, ChineseCLIPImageProcessor) + self.assertIsInstance(processor_fast.image_processor, ChineseCLIPImageProcessor) + + def test_save_load_pretrained_additional_features(self): + with tempfile.TemporaryDirectory() as tmpdir: + processor = ChineseCLIPProcessor( + tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor() + ) + processor.save_pretrained(tmpdir) + + tokenizer_add_kwargs = self.get_tokenizer(cls_token="(CLS)", sep_token="(SEP)") + image_processor_add_kwargs = self.get_image_processor(do_normalize=False) + + processor = ChineseCLIPProcessor.from_pretrained( + tmpdir, cls_token="(CLS)", sep_token="(SEP)", do_normalize=False + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, BertTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, ChineseCLIPImageProcessor) + + def test_image_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) + + image_input = self.prepare_image_inputs() + + input_feat_extract = image_processor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "Alexandra,T-shirt的价格是15便士。" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "Alexandra,T-shirt的价格是15便士。" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"]) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + def test_tokenizer_decode(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + def test_model_input_names(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "Alexandra,T-shirt的价格是15便士。" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), 
processor.model_input_names) diff --git a/docs/transformers/tests/models/clap/__init__.py b/docs/transformers/tests/models/clap/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/clap/test_feature_extraction_clap.py b/docs/transformers/tests/models/clap/test_feature_extraction_clap.py new file mode 100644 index 0000000000000000000000000000000000000000..b2ccb5017135b2a5bc194402b2b5195d9d125410 --- /dev/null +++ b/docs/transformers/tests/models/clap/test_feature_extraction_clap.py @@ -0,0 +1,546 @@ +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import itertools +import random +import unittest + +import numpy as np +from datasets import load_dataset + +from transformers import ClapFeatureExtractor +from transformers.testing_utils import require_torch, require_torchaudio +from transformers.trainer_utils import set_seed +from transformers.utils.import_utils import is_torch_available + +from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin + + +if is_torch_available(): + import torch + +global_rng = random.Random() + + +# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values + + +@require_torch +@require_torchaudio +# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester with Whisper->Clap +class ClapFeatureExtractionTester: + def __init__( + self, + parent, + batch_size=7, + min_seq_length=400, + max_seq_length=2000, + feature_size=10, + hop_length=160, + chunk_length=8, + padding_value=0.0, + sampling_rate=4_000, + return_attention_mask=False, + do_normalize=True, + ): + self.parent = parent + self.batch_size = batch_size + self.min_seq_length = min_seq_length + self.max_seq_length = max_seq_length + self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) + self.padding_value = padding_value + self.sampling_rate = sampling_rate + self.return_attention_mask = return_attention_mask + self.do_normalize = do_normalize + self.feature_size = feature_size + self.chunk_length = chunk_length + self.hop_length = hop_length + + def prepare_feat_extract_dict(self): + return { + "feature_size": self.feature_size, + "hop_length": self.hop_length, + "chunk_length": self.chunk_length, + "padding_value": self.padding_value, + "sampling_rate": self.sampling_rate, + "return_attention_mask": self.return_attention_mask, + "do_normalize": self.do_normalize, + } + + def prepare_inputs_for_common(self, equal_length=False, numpify=False): + def _flatten(list_of_lists): + return list(itertools.chain(*list_of_lists)) + + if equal_length: + 
speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)] + else: + # make sure that inputs increase in size + speech_inputs = [ + floats_list((x, self.feature_size)) + for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) + ] + if numpify: + speech_inputs = [np.asarray(x) for x in speech_inputs] + return speech_inputs + + +@require_torch +@require_torchaudio +class ClapFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): + feature_extraction_class = ClapFeatureExtractor + + # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.setUp with Whisper->Clap + def setUp(self): + self.feat_extract_tester = ClapFeatureExtractionTester(self) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + # create three inputs of length 800, 1000, and 1200 + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] + + # Test feature size + input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features + self.assertTrue(input_features.ndim == 4) + + # Test not batched input + encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features + encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features + self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) + + # Test batched + encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features + encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + # Test 2-D numpy arrays are batched. 
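+ # (three equal-length 800-sample inputs are stacked into one (3, 800) numpy array and should encode identically to the list form)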
+ speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)] + np_speech_inputs = np.asarray(speech_inputs) + encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features + encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_double_precision_pad + def test_double_precision_pad(self): + import torch + + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + np_speech_inputs = np.random.rand(100, 32).astype(np.float64) + py_speech_inputs = np_speech_inputs.tolist() + + for inputs in [py_speech_inputs, np_speech_inputs]: + np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np") + self.assertTrue(np_processed.input_features.dtype == np.float32) + pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt") + self.assertTrue(pt_processed.input_features.dtype == torch.float32) + + # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest._load_datasamples + def _load_datasamples(self, num_samples): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def test_integration_fusion_short_input(self): + # fmt: off + EXPECTED_INPUT_FEATURES = torch.tensor( + [ + [ + # "repeat" + [ + -20.1049, -19.9764, -20.0731, -19.5055, -27.5018, -22.5761, -26.6071, + -29.0091, -26.4659, -26.4236, -28.8808, -31.9190, -32.4848, -34.1186, + -34.0340, -32.8803, -30.9895, -37.6238, -38.0347, -40.6263, -36.3496, + -42.2533, -32.9132, -27.7068, -29.3704, -30.3208, -22.5972, -27.1494, + -30.1975, -31.1005, -29.9372, -27.1917, -25.9806, -30.3489, -33.2380, + -31.9062, -36.5498, -32.8721, -30.5629, -27.4674, -22.2232, -22.5653, + -16.3868, -17.2713, -25.9738, -30.6256, -34.3766, -31.1292, -27.8950, + -27.0588, -25.6206, -23.0712, -26.6050, -28.0112, -32.6847, -34.3396, + -34.9738, -35.8463, -39.2324, -37.1188, -33.3705, -28.9230, -28.9112, + -28.6578 + ], + [ + -36.7233, -30.0587, -24.8431, -18.4611, -16.8149, -23.9319, -32.8580, + -34.2264, -27.4332, -26.8027, -29.2721, -33.9033, -39.3403, -35.3232, + -26.8076, -28.6460, -35.2780, -36.0738, -35.4996, -37.7631, -39.5056, + -34.7112, -36.8741, -34.1066, -32.9474, -33.6604, -27.9937, -30.9594, + -26.2928, -32.0485, -29.2151, -29.2917, -32.7308, -29.6542, -31.1454, + -37.0088, -32.3388, -37.3086, -31.1024, -27.2889, -19.6788, -21.1488, + -19.5144, -14.8889, -21.2006, -24.7488, -27.7940, -31.1058, -27.5068, + -21.5737, -22.3780, -21.5151, -26.3086, -30.9223, -33.5043, -32.0307, + -37.3806, -41.6188, -45.6650, -40.5131, -32.5023, -26.7385, -26.3709, + -26.7761 + ] + ], + [ + # "repeatpad" + [ + -25.7496, -24.9339, -24.1357, -23.1271, -23.7853, -26.1264, -29.1456, + -33.2060, -37.8179, -42.4833, -41.9386, -41.2164, -42.3566, -44.2575, + -40.0217, -36.6794, -36.6974, -38.7819, -42.0880, -45.5560, -39.9368, + -36.3219, -35.5981, -36.6434, -35.1851, -33.0684, -30.0437, -30.2010, + -34.3476, -42.1373, -38.8039, -37.3355, -40.4576, -41.0485, -40.6377, + -38.2275, -42.7481, -34.6084, -34.7048, -29.5149, 
-26.3935, -26.8952, + -34.1336, -26.2904, -28.2571, -32.5642, -36.7240, -35.5334, -38.2451, + -34.8177, -28.9754, -25.1096, -27.9768, -32.3184, -37.0269, -40.5136, + -40.8061, -36.4948, -40.3767, -38.9671, -38.3552, -34.1250, -30.9035, + -31.6112 + ], + [ + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100. + ] + ], + [ + # None, same as "repeatpad" + [ + -25.7496, -24.9339, -24.1357, -23.1271, -23.7853, -26.1264, -29.1456, + -33.2060, -37.8179, -42.4833, -41.9386, -41.2164, -42.3566, -44.2575, + -40.0217, -36.6794, -36.6974, -38.7819, -42.0880, -45.5560, -39.9368, + -36.3219, -35.5981, -36.6434, -35.1851, -33.0684, -30.0437, -30.2010, + -34.3476, -42.1373, -38.8039, -37.3355, -40.4576, -41.0485, -40.6377, + -38.2275, -42.7481, -34.6084, -34.7048, -29.5149, -26.3935, -26.8952, + -34.1336, -26.2904, -28.2571, -32.5642, -36.7240, -35.5334, -38.2451, + -34.8177, -28.9754, -25.1096, -27.9768, -32.3184, -37.0269, -40.5136, + -40.8061, -36.4948, -40.3767, -38.9671, -38.3552, -34.1250, -30.9035, + -31.6112 + ], + [ + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100. + ] + ], + [ + # "pad" + [ + -58.5260, -58.1155, -57.8623, -57.5059, -57.9178, -58.7171, -59.2343, + -59.9833, -60.9764, -62.0722, -63.5723, -65.7111, -67.5153, -68.7088, + -69.8325, -70.2987, -70.1548, -70.6233, -71.5702, -72.5159, -72.3821, + -70.1817, -67.0315, -64.1387, -62.2202, -61.0717, -60.4951, -61.6005, + -63.7358, -67.1400, -67.6185, -65.5635, -64.3593, -63.7138, -63.6209, + -66.4950, -72.6284, -63.3961, -56.8334, -52.7319, -50.6310, -51.3728, + -53.5619, -51.9190, -50.9708, -52.8684, -55.8073, -58.8227, -60.6991, + -57.0547, -52.7611, -51.4388, -54.4892, -60.8950, -66.1024, -72.4352, + -67.8538, -65.1463, -68.7588, -72.3080, -68.4864, -60.4688, -57.1516, + -60.9460 + ], + [ + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100. 
+ ] + ] + ] + ) + # fmt: on + MEL_BIN = [[976, 977], [976, 977], [976, 977], [196, 197]] + input_speech = self._load_datasamples(1) + feature_extractor = ClapFeatureExtractor() + for padding, EXPECTED_VALUES, idx_in_mel in zip( + ["repeat", "repeatpad", None, "pad"], EXPECTED_INPUT_FEATURES, MEL_BIN + ): + input_features = feature_extractor(input_speech, return_tensors="pt", padding=padding).input_features + self.assertEqual(input_features.shape, (1, 4, 1001, 64)) + + torch.testing.assert_close(input_features[0, 0, idx_in_mel[0]], EXPECTED_VALUES[0], rtol=1e-4, atol=1e-4) + torch.testing.assert_close(input_features[0, 0, idx_in_mel[1]], EXPECTED_VALUES[1], rtol=1e-4, atol=1e-4) + + self.assertTrue(torch.all(input_features[0, 0] == input_features[0, 1])) + self.assertTrue(torch.all(input_features[0, 0] == input_features[0, 2])) + self.assertTrue(torch.all(input_features[0, 0] == input_features[0, 3])) + + def test_integration_rand_trunc_short_input(self): + # fmt: off + EXPECTED_INPUT_FEATURES = torch.tensor( + [ + [ + # "repeat" + [ + -35.0483, -35.7865, -38.2884, -40.0220, -42.5349, -44.9489, -43.2228, + -44.6499, -47.6253, -49.6983, -50.2127, -52.5483, -52.2223, -51.9157, + -49.4082, -51.2024, -57.0476, -56.2803, -58.1618, -60.7474, -55.0389, + -60.9514, -59.3080, -50.4419, -47.8172, -48.7570, -55.2552, -44.5036, + -44.1148, -50.8218, -51.0968, -52.9408, -51.1037, -48.9789, -47.5897, + -52.0915, -55.4216, -54.1529, -58.0149, -58.0866, -52.7798, -52.6154, + -45.9144, -46.2008, -40.7603, -41.1703, -50.2250, -55.4112, -59.4818, + -54.5795, -53.5552, -51.3668, -49.8358, -50.3186, -54.0452, -57.6030, + -61.1589, -61.6415, -63.2756, -66.5890, -62.8543, -58.0665, -56.7203, + -56.7632 + ], + [ + -47.1320, -37.9961, -34.0076, -36.7109, -47.9057, -48.4924, -43.8371, + -44.9728, -48.1689, -52.9141, -57.6077, -52.8520, -44.8502, -45.6764, + -51.8389, -56.4284, -54.6972, -53.4889, -55.6077, -58.7149, -60.3760, + -54.0136, -56.0730, -55.9870, -54.4017, -53.1094, -53.5640, -50.3064, + -49.9520, -49.3239, -48.1668, -53.4852, -50.4561, -50.8688, -55.1970, + -51.5538, -53.0260, -59.6933, -54.8183, -59.5895, -55.9589, -50.3761, + -44.1282, -44.1463, -43.8540, -39.1168, -45.3893, -49.5542, -53.1505, + -55.2870, -50.3921, -46.8511, -47.4444, -49.5633, -56.0034, -59.0815, + -59.0018, -63.7589, -69.5745, -71.5789, -64.0498, -56.0558, -54.3475, + -54.7004 + ] + ], + [ + # "repeatpad" + [ + -40.3184, -39.7186, -39.8807, -41.6508, -45.3613, -50.4785, -57.0297, + -60.4944, -59.1642, -58.9495, -60.4661, -62.5300, -58.4759, -55.2865, + -54.8973, -56.0780, -57.5482, -59.6557, -64.3309, -65.0330, -59.4941, + -56.8552, -55.0519, -55.9817, -56.9739, -55.2827, -54.5312, -51.4141, + -50.4289, -51.9131, -57.5821, -63.9979, -59.9180, -58.9489, -62.3247, + -62.6975, -63.7948, -60.5250, -64.6107, -58.7905, -57.0229, -54.3084, + -49.8445, -50.4459, -57.0172, -50.6425, -52.5992, -57.4207, -61.6358, + -60.6540, -63.1968, -57.4360, -52.3263, -51.7695, -57.1946, -62.9610, + -66.7359, -67.0335, -63.7440, -68.1775, -66.3798, -62.8650, -59.8972, + -59.3139 + ], + [ + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100. 
+ ] + ], + [ + # None, same as "repeatpad" + [ + -40.3184, -39.7186, -39.8807, -41.6508, -45.3613, -50.4785, -57.0297, + -60.4944, -59.1642, -58.9495, -60.4661, -62.5300, -58.4759, -55.2865, + -54.8973, -56.0780, -57.5482, -59.6557, -64.3309, -65.0330, -59.4941, + -56.8552, -55.0519, -55.9817, -56.9739, -55.2827, -54.5312, -51.4141, + -50.4289, -51.9131, -57.5821, -63.9979, -59.9180, -58.9489, -62.3247, + -62.6975, -63.7948, -60.5250, -64.6107, -58.7905, -57.0229, -54.3084, + -49.8445, -50.4459, -57.0172, -50.6425, -52.5992, -57.4207, -61.6358, + -60.6540, -63.1968, -57.4360, -52.3263, -51.7695, -57.1946, -62.9610, + -66.7359, -67.0335, -63.7440, -68.1775, -66.3798, -62.8650, -59.8972, + -59.3139 + ], + [ + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100. + ] + ], + [ + # "pad" + [ + -73.3190, -73.6349, -74.1451, -74.8539, -75.7476, -76.5438, -78.5540, + -80.1339, -81.8911, -83.7560, -85.5387, -86.7466, -88.2072, -88.6090, + -88.8243, -89.0784, -89.4364, -89.8179, -91.3146, -92.2833, -91.7221, + -90.9440, -88.1315, -86.2425, -84.2281, -82.4893, -81.5993, -81.1328, + -81.5759, -83.1068, -85.6525, -88.9520, -88.9187, -87.2703, -86.3052, + -85.7188, -85.8802, -87.9996, -95.0464, -88.0133, -80.8561, -76.5597, + -74.2816, -74.8109, -77.3615, -76.0719, -75.3426, -77.6428, -80.9663, + -84.5275, -84.9907, -80.5205, -77.2851, -78.6259, -84.7740, -91.4535, + -98.1894, -94.3872, -92.3735, -97.6807, -98.1501, -91.4344, -85.2842, + -88.4338 + ], + [ + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., + -100., -100., -100., -100. 
+ ] + ] + ] + ) + # fmt: on + MEL_BIN = [[976, 977], [976, 977], [976, 977], [196, 197]] + input_speech = self._load_datasamples(1) + feature_extractor = ClapFeatureExtractor() + for padding, EXPECTED_VALUES, idx_in_mel in zip( + ["repeat", "repeatpad", None, "pad"], EXPECTED_INPUT_FEATURES, MEL_BIN + ): + input_features = feature_extractor( + input_speech, return_tensors="pt", truncation="rand_trunc", padding=padding + ).input_features + self.assertEqual(input_features.shape, (1, 1, 1001, 64)) + torch.testing.assert_close(input_features[0, 0, idx_in_mel[0]], EXPECTED_VALUES[0], rtol=1e-4, atol=1e-4) + torch.testing.assert_close(input_features[0, 0, idx_in_mel[1]], EXPECTED_VALUES[1], rtol=1e-4, atol=1e-4) + + def test_integration_fusion_long_input(self): + # fmt: off + EXPECTED_INPUT_FEATURES = torch.tensor( + [ + [ + -11.1830, -10.1894, -8.6051, -4.8578, -1.3268, -8.4606, -14.5453, + -9.2017, 0.5781, 16.2129, 14.8289, 3.6326, -3.8794, -6.5544, + -2.4408, 1.9531, 6.0967, 1.7590, -7.6730, -6.1571, 2.0052, + 16.6694, 20.6447, 21.2145, 13.4972, 15.9043, 16.8987, 4.1766, + 11.9428, 21.2372, 12.3016, 4.8604, 6.7241, 1.8543, 4.9235, + 5.3188, -0.9897, -1.2416, -6.5864, 2.9529, 2.9274, 6.4753, + 10.2300, 11.2127, 3.4042, -1.0055, -6.0475, -6.7524, -3.9801, + -1.4434, 0.4740, -0.1584, -4.5457, -8.5746, -8.8428, -13.1475, + -9.6079, -8.5798, -4.1143, -3.7966, -7.1651, -6.1517, -8.0258, + -12.1486 + ], + [ + -10.2017, -7.9924, -5.9517, -3.9372, -1.9735, -4.3130, 16.1647, + 25.0592, 23.5532, 14.4974, -7.0778, -10.2262, 6.4782, 20.3454, + 19.4269, 1.7976, -16.5070, 4.9380, 12.3390, 6.9285, -13.6325, + -8.5298, 1.0839, -5.9629, -8.4812, 3.1331, -2.0963, -16.6046, + -14.0070, -17.5707, -13.2080, -17.2168, -17.7770, -12.1111, -18.6184, + -17.1897, -13.9801, -12.0426, -23.5400, -25.6823, -23.5813, -18.7847, + -20.5473, -25.6458, -19.7585, -27.6007, -28.9276, -24.8948, -25.4458, + -22.2807, -19.6613, -19.2669, -15.7813, -19.6821, -24.3439, -22.2598, + -28.2631, -30.1017, -32.7646, -33.6525, -27.5639, -22.0548, -27.8054, + -29.6947 + ], + [ + -9.2078, -7.2963, -6.2095, -7.9959, -2.9280, -11.1843, -6.1490, + 5.0733, 19.2957, 21.4578, 14.6803, -3.3153, -6.3334, -2.3542, + 6.9509, 15.2965, 14.6620, 5.2075, -0.0873, 1.1919, 18.1986, + 20.8470, 10.8035, 2.2516, 7.6905, 7.7427, -1.2543, -5.0018, + 0.9809, -2.1584, -5.4580, -5.4760, -11.8888, -9.0605, -8.4638, + -9.9897, -0.0540, -5.1629, 0.0483, -4.1504, -4.8140, -7.8236, + -9.0622, -10.1742, -8.9597, -11.5380, -16.5603, -17.1858, -17.5032, + -20.9326, -23.9543, -25.2602, -25.3429, -27.4536, -26.8859, -22.7852, + -25.8288, -24.8399, -23.8893, -24.2096, -26.5415, -23.7281, -25.6851, + -22.3629 + ], + [ + 1.3448, 2.9883, 4.0366, -0.8019, -10.4191, -10.0883, -4.3812, + 0.8136, 2.1579, 0.0832, 1.0949, -0.9759, -5.5319, -4.6009, + -6.5452, -14.9155, -20.1584, -9.3611, -2.4271, 1.4031, 4.9910, + 8.6916, 8.6785, 10.1973, 9.9029, 5.3840, 7.5336, 5.2803, + 2.8144, -0.3138, 2.2216, 5.7328, 7.5574, 7.7402, 1.0681, + 3.1049, 7.0742, 6.5588, 7.3712, 5.7881, 8.6874, 8.7725, + 2.8133, -4.5809, -6.1317, -5.1719, -5.0192, -9.0977, -10.9391, + -6.0769, 1.6016, -0.8965, -7.2252, -7.8632, -11.4468, -11.7446, + -10.7447, -7.0601, -2.7748, -4.1798, -2.8433, -3.1352, 0.8097, + 6.4212 + ] + ] + ) + # fmt: on + MEL_BIN = 963 + input_speech = torch.cat([torch.tensor(x) for x in self._load_datasamples(5)]) + feature_extractor = ClapFeatureExtractor() + for padding, EXPECTED_VALUES, block_idx in zip( + ["repeat", "repeatpad", None, "pad"], EXPECTED_INPUT_FEATURES, [1, 2, 0, 3] + ): + 
set_seed(987654321) + input_features = feature_extractor(input_speech, return_tensors="pt", padding=padding).input_features + self.assertEqual(input_features.shape, (1, 4, 1001, 64)) + torch.testing.assert_close(input_features[0, block_idx, MEL_BIN], EXPECTED_VALUES, rtol=1e-3, atol=1e-3) + + def test_integration_rand_trunc_long_input(self): + # fmt: off + EXPECTED_INPUT_FEATURES = torch.tensor( + [ + [ + -35.4022, -32.7555, -31.2004, -32.7764, -42.5770, -41.6339, -43.1630, + -44.5080, -44.3029, -48.9628, -39.5022, -39.2105, -43.1350, -43.2195, + -48.4894, -52.2344, -57.6891, -52.2228, -45.5155, -44.2893, -43.4697, + -46.6702, -43.7490, -40.4819, -42.7275, -46.3434, -46.8412, -41.2003, + -43.1681, -46.2948, -46.1925, -47.8333, -45.6812, -44.9182, -41.7786, + -43.3809, -44.3199, -42.8814, -45.4771, -46.7114, -46.9746, -42.7090, + -41.6057, -38.3965, -40.1980, -41.0263, -34.1256, -28.3289, -29.0201, + -30.4453, -29.5561, -30.1734, -25.9406, -19.0897, -15.8452, -20.1351, + -23.6515, -23.1194, -17.1845, -19.4399, -23.6527, -22.8768, -20.7279, + -22.7864 + ], + [ + -35.7719, -27.2566, -23.6964, -27.5521, 0.2510, 7.4391, 1.3917, + -13.3417, -28.1758, -17.0856, -5.7723, -0.8000, -7.8832, -15.5548, + -30.5935, -24.7571, -13.7009, -10.3432, -21.2464, -24.8118, -19.4080, + -14.9779, -11.7991, -18.4485, -20.1982, -17.3652, -20.6328, -28.2967, + -25.7819, -21.8962, -28.5083, -29.5719, -30.2120, -35.7033, -31.8218, + -34.0408, -37.7744, -33.9653, -31.3009, -30.9063, -28.6153, -32.2202, + -28.5456, -28.8579, -32.5170, -37.9152, -43.0052, -46.4849, -44.0786, + -39.1933, -33.2757, -31.6313, -42.6386, -52.3679, -53.5785, -55.6444, + -47.0050, -47.6459, -56.6361, -60.6781, -61.5244, -55.8272, -60.4832, + -58.1897 + ], + [ + -38.2686, -36.6285, -32.5835, -35.1693, -37.7938, -37.4035, -35.3132, + -35.6083, -36.3609, -40.9472, -36.7846, -36.1544, -38.9076, -39.3618, + -35.4953, -34.2809, -39.9466, -39.7433, -34.8347, -37.5674, -41.5689, + -38.9161, -34.3947, -30.2924, -30.4841, -34.5831, -28.9261, -24.8849, + -31.2324, -27.1622, -27.2107, -25.9385, -30.1691, -30.9223, -23.9495, + -25.6047, -26.7119, -28.5523, -27.7481, -32.8427, -35.4650, -31.0399, + -31.2073, -30.5163, -22.9819, -20.8892, -19.2510, -24.7905, -28.9426, + -28.1998, -26.7386, -25.0140, -27.9223, -32.9913, -33.1864, -34.9742, + -38.5995, -39.6990, -29.3203, -22.4697, -25.6415, -33.5608, -33.0945, + -27.1716 + ], + [ + -33.2015, -28.7741, -21.9457, -23.4888, -32.1072, -8.6307, 3.2724, + 5.9157, -0.9221, -30.1814, -31.0015, -27.4508, -27.0477, -9.5342, + 0.3221, 0.6511, -7.1596, -25.9707, -32.8924, -32.2300, -13.8974, + -0.4895, 0.9168, -10.7663, -27.1176, -35.0829, -11.6859, -4.8855, + -11.8898, -26.6167, -5.6192, -3.8443, -19.7947, -14.4101, -8.6236, + -21.2458, -21.0801, -17.9136, -24.4663, -18.6333, -24.8085, -15.5854, + -15.4344, -11.5046, -22.3625, -27.3387, -32.4353, -30.9670, -31.3789, + -35.4044, -34.4591, -25.2433, -28.0773, -33.8736, -33.0224, -33.3155, + -38.5302, -39.2741, -36.6395, -34.7729, -32.4483, -42.4001, -49.2857, + -39.1682 + ] + ] + ) + # fmt: on + MEL_BIN = 963 + SEEDS = [987654321, 1234, 666, 5555] + input_speech = torch.cat([torch.tensor(x) for x in self._load_datasamples(5)]) + feature_extractor = ClapFeatureExtractor() + for padding, EXPECTED_VALUES, seed in zip( + ["repeat", "repeatpad", None, "pad"], EXPECTED_INPUT_FEATURES, SEEDS + ): + set_seed(seed) + input_features = feature_extractor( + input_speech, return_tensors="pt", truncation="rand_trunc", padding=padding + ).input_features + 
self.assertEqual(input_features.shape, (1, 1, 1001, 64)) + torch.testing.assert_close(input_features[0, 0, MEL_BIN], EXPECTED_VALUES, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/clap/test_modeling_clap.py b/docs/transformers/tests/models/clap/test_modeling_clap.py new file mode 100644 index 0000000000000000000000000000000000000000..e828a54827c50a7057cb4d472858803e2798d393 --- /dev/null +++ b/docs/transformers/tests/models/clap/test_modeling_clap.py @@ -0,0 +1,755 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch CLAP model.""" + +import inspect +import os +import tempfile +import unittest + +import numpy as np +from datasets import load_dataset + +from transformers import ClapAudioConfig, ClapConfig, ClapProcessor, ClapTextConfig +from transformers.testing_utils import require_torch, slow, torch_device +from transformers.utils import is_torch_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, + random_attention_mask, +) +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + ClapAudioModel, + ClapAudioModelWithProjection, + ClapModel, + ClapTextModel, + ClapTextModelWithProjection, + ) + + +class ClapAudioModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=60, + num_mel_bins=16, + window_size=4, + spec_size=64, + patch_size=2, + patch_stride=2, + seq_length=16, + freq_ratio=2, + num_channels=3, + is_training=True, + hidden_size=32, + patch_embeds_hidden_size=16, + projection_dim=32, + depths=[2, 2], + num_hidden_layers=2, + num_heads=[2, 2], + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_mel_bins = num_mel_bins + self.window_size = window_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.depths = depths + self.num_heads = num_heads + self.num_attention_heads = num_heads[0] + self.seq_length = seq_length + self.spec_size = spec_size + self.freq_ratio = freq_ratio + self.patch_stride = patch_stride + self.patch_embeds_hidden_size = patch_embeds_hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + input_features = floats_tensor([self.batch_size, 1, self.hidden_size, self.num_mel_bins]) + config = self.get_config() + + return config, input_features + + def get_config(self): + return ClapAudioConfig( + 
image_size=self.image_size, + patch_size=self.patch_size, + num_mel_bins=self.num_mel_bins, + window_size=self.window_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + patch_stride=self.patch_stride, + projection_dim=self.projection_dim, + depths=self.depths, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + spec_size=self.spec_size, + freq_ratio=self.freq_ratio, + patch_embeds_hidden_size=self.patch_embeds_hidden_size, + ) + + def create_and_check_model(self, config, input_features): + model = ClapAudioModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_features) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_with_projection(self, config, input_features): + model = ClapAudioModelWithProjection(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_features) + self.parent.assertEqual(result.audio_embeds.shape, (self.batch_size, self.projection_dim)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_features = config_and_inputs + inputs_dict = {"input_features": input_features} + return config, inputs_dict + + +@require_torch +class ClapAudioModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as CLAP does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (ClapAudioModel, ClapAudioModelWithProjection) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = ClapAudioModelTester(self) + self.config_tester = ConfigTester(self, config_class=ClapAudioConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="ClapAudioModel does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [2 * self.model_tester.patch_embeds_hidden_size, 2 * self.model_tester.patch_embeds_hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, 
config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass") + def test_retain_grad_hidden_states_attentions(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["input_features"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_projection(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + + @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass") + def test_training(self): + pass + + @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass") + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "laion/clap-htsat-fused" + model = ClapAudioModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + def test_model_with_projection_from_pretrained(self): + model_name = "laion/clap-htsat-fused" + model = ClapAudioModelWithProjection.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertTrue(hasattr(model, "audio_projection")) + + +class ClapTextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + scope=None, + projection_hidden_act="relu", + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + self.projection_hidden_act = projection_hidden_act + + def prepare_config_and_inputs(self): + input_ids = 
ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = self.get_config() + + return config, input_ids, input_mask + + def get_config(self): + return ClapTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + projection_hidden_act=self.projection_hidden_act, + ) + + def create_and_check_model(self, config, input_ids, input_mask): + model = ClapTextModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_with_projection(self, config, input_ids, input_mask): + model = ClapTextModelWithProjection(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.text_embeds.shape, (self.batch_size, self.projection_dim)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class ClapTextModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (ClapTextModel, ClapTextModelWithProjection) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = ClapTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=ClapTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_projection(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + + @unittest.skip(reason="ClapTextModel does not output any loss term in the forward pass") + def test_training(self): + pass + + @unittest.skip(reason="ClapTextModel does not output any loss term in the forward pass") + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def 
test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="ClapTextModel does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "laion/clap-htsat-fused" + model = ClapTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + def test_model_with_projection_from_pretrained(self): + model_name = "laion/clap-htsat-fused" + model = ClapTextModelWithProjection.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertTrue(hasattr(model, "text_projection")) + + +class ClapModelTester: + def __init__(self, parent, text_kwargs=None, audio_kwargs=None, is_training=True): + if text_kwargs is None: + text_kwargs = {} + if audio_kwargs is None: + audio_kwargs = {} + + self.parent = parent + self.text_model_tester = ClapTextModelTester(parent, **text_kwargs) + self.audio_model_tester = ClapAudioModelTester(parent, **audio_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test + self.is_training = is_training + + def prepare_config_and_inputs(self): + _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + _, input_features = self.audio_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask, input_features + + def get_config(self): + return ClapConfig.from_text_audio_configs( + self.text_model_tester.get_config(), self.audio_model_tester.get_config(), projection_dim=64 + ) + + def create_and_check_model(self, config, input_ids, attention_mask, input_features): + model = ClapModel(config).to(torch_device).eval() + with torch.no_grad(): + result = model(input_ids, input_features, attention_mask) + self.parent.assertEqual( + result.logits_per_audio.shape, (self.audio_model_tester.batch_size, self.text_model_tester.batch_size) + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.audio_model_tester.batch_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, input_features = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "input_features": input_features, + "return_loss": True, + } + return config, inputs_dict + + +@require_torch +class ClapModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (ClapModel,) if is_torch_available() else () + pipeline_model_mapping = {"feature-extraction": ClapModel} if is_torch_available() else {} + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + + def setUp(self): + self.model_tester = ClapModelTester(self) + common_properties = ["logit_scale_init_value", "projection_hidden_act", "projection_dim"] + self.config_tester = ConfigTester( + self, config_class=ClapConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_config(self): + 
self.config_tester.run_common_tests() + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="Inputs_embeds is tested in individual model tests") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="ClapModel does not have input/output embeddings") + def test_model_get_set_embeddings(self): + pass + + # override as the `logit_scale` parameter initialization is different for CLAP + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + # check if `logit_scale` is initialized as per the original implementation + if name == "logit_scale": + self.assertAlmostEqual( + param.data.item(), + np.log(1 / 0.07), + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + self.skipTest(reason="test_torchscript is set to False") + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + configs_no_init.return_dict = False + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + try: + input_ids = inputs_dict["input_ids"] + input_features = inputs_dict["input_features"] # CLAP needs input_features + traced_model = torch.jit.trace(model, (input_ids, input_features)) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + non_persistent_buffers = {} + for key in loaded_model_state_dict.keys(): + if key not in model_state_dict.keys(): + non_persistent_buffers[key] = loaded_model_state_dict[key] + + loaded_model_state_dict = { + key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers + } + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + model_buffers = list(model.buffers()) + for non_persistent_buffer in non_persistent_buffers.values(): + found_buffer = False + for i, model_buffer in enumerate(model_buffers): + if torch.equal(non_persistent_buffer, model_buffer): + found_buffer = True + break + + self.assertTrue(found_buffer) + model_buffers.pop(i) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + 
self.assertTrue(models_equal) + + def test_load_audio_text_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Save ClapConfig and check if we can load ClapAudioConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + audio_config = ClapAudioConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.audio_config.to_dict(), audio_config.to_dict()) + + # Save ClapConfig and check if we can load ClapTextConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + text_config = ClapTextConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) + + @slow + def test_model_from_pretrained(self): + model_name = "laion/clap-htsat-fused" + model = ClapModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@slow +@require_torch +class ClapModelIntegrationTest(unittest.TestCase): + paddings = ["repeatpad", "repeat", "pad"] + + def test_integration_unfused(self): + EXPECTED_MEANS_UNFUSED = { + "repeatpad": 0.0024, + "pad": 0.0020, + "repeat": 0.0023, + } + + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + audio_sample = librispeech_dummy[-1] + + model_id = "laion/clap-htsat-unfused" + + model = ClapModel.from_pretrained(model_id).to(torch_device) + processor = ClapProcessor.from_pretrained(model_id) + + for padding in self.paddings: + inputs = processor(audios=audio_sample["audio"]["array"], return_tensors="pt", padding=padding).to( + torch_device + ) + + audio_embed = model.get_audio_features(**inputs) + expected_mean = EXPECTED_MEANS_UNFUSED[padding] + + self.assertTrue( + torch.allclose(audio_embed.cpu().mean(), torch.tensor([expected_mean]), atol=1e-3, rtol=1e-3) + ) + + def test_integration_fused(self): + EXPECTED_MEANS_FUSED = { + "repeatpad": 0.00069, + "repeat": 0.00196, + "pad": -0.000379, + } + + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + audio_sample = librispeech_dummy[-1] + + model_id = "laion/clap-htsat-fused" + + model = ClapModel.from_pretrained(model_id).to(torch_device) + processor = ClapProcessor.from_pretrained(model_id) + + for padding in self.paddings: + inputs = processor( + audios=audio_sample["audio"]["array"], return_tensors="pt", padding=padding, truncation="fusion" + ).to(torch_device) + + audio_embed = model.get_audio_features(**inputs) + expected_mean = EXPECTED_MEANS_FUSED[padding] + + self.assertTrue( + torch.allclose(audio_embed.cpu().mean(), torch.tensor([expected_mean]), atol=1e-3, rtol=1e-3) + ) + + def test_batched_fused(self): + EXPECTED_MEANS_FUSED = { + "repeatpad": 0.0010, + "repeat": 0.0020, + "pad": 0.0006, + } + + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]] + + model_id = "laion/clap-htsat-fused" + + model = ClapModel.from_pretrained(model_id).to(torch_device) + processor = ClapProcessor.from_pretrained(model_id) + + for padding in self.paddings: + inputs = processor(audios=audio_samples, return_tensors="pt", padding=padding, truncation="fusion").to( + torch_device + ) + + audio_embed = model.get_audio_features(**inputs) + expected_mean = EXPECTED_MEANS_FUSED[padding] + + self.assertTrue( + torch.allclose(audio_embed.cpu().mean(), torch.tensor([expected_mean]), 
atol=1e-3, rtol=1e-3) + ) + + def test_batched_unfused(self): + EXPECTED_MEANS_FUSED = { + "repeatpad": 0.0016, + "repeat": 0.0019, + "pad": 0.0019, + } + + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]] + + model_id = "laion/clap-htsat-unfused" + + model = ClapModel.from_pretrained(model_id).to(torch_device) + processor = ClapProcessor.from_pretrained(model_id) + + for padding in self.paddings: + inputs = processor(audios=audio_samples, return_tensors="pt", padding=padding).to(torch_device) + + audio_embed = model.get_audio_features(**inputs) + expected_mean = EXPECTED_MEANS_FUSED[padding] + + self.assertTrue( + torch.allclose(audio_embed.cpu().mean(), torch.tensor([expected_mean]), atol=1e-3, rtol=1e-3) + ) diff --git a/docs/transformers/tests/models/clap/test_processor_clap.py b/docs/transformers/tests/models/clap/test_processor_clap.py new file mode 100644 index 0000000000000000000000000000000000000000..49e9972ea02e22a0661ab0d8ef71cc1cfe29b291 --- /dev/null +++ b/docs/transformers/tests/models/clap/test_processor_clap.py @@ -0,0 +1,125 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import shutil +import tempfile +import unittest + +from transformers import ClapFeatureExtractor, ClapProcessor, RobertaTokenizer, RobertaTokenizerFast +from transformers.testing_utils import require_sentencepiece, require_torchaudio + +from .test_feature_extraction_clap import floats_list + + +@require_torchaudio +@require_sentencepiece +class ClapProcessorTest(unittest.TestCase): + def setUp(self): + self.checkpoint = "laion/clap-htsat-unfused" + self.tmpdirname = tempfile.mkdtemp() + + def get_tokenizer(self, **kwargs): + return RobertaTokenizer.from_pretrained(self.checkpoint, **kwargs) + + def get_feature_extractor(self, **kwargs): + return ClapFeatureExtractor.from_pretrained(self.checkpoint, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def test_save_load_pretrained_default(self): + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + + processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + processor.save_pretrained(self.tmpdirname) + processor = ClapProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) + self.assertIsInstance(processor.tokenizer, RobertaTokenizerFast) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) + self.assertIsInstance(processor.feature_extractor, ClapFeatureExtractor) + + def test_save_load_pretrained_additional_features(self): + processor = ClapProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) + + processor = ClapProcessor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, RobertaTokenizerFast) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.feature_extractor, ClapFeatureExtractor) + + def test_feature_extractor(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + raw_speech = floats_list((3, 1000)) + + input_feat_extract = feature_extractor(raw_speech, return_tensors="np") + input_processor = processor(audios=raw_speech, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + input_str = "This is a test string" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_tokenizer_decode(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 
1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + def test_model_input_names(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + self.assertListEqual( + processor.model_input_names[2:], + feature_extractor.model_input_names, + msg="`processor` and `feature_extractor` model input names do not match", + ) diff --git a/docs/transformers/tests/models/clip/__init__.py b/docs/transformers/tests/models/clip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/clip/test_image_processing_clip.py b/docs/transformers/tests/models/clip/test_image_processing_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..86f53a7489be61aef3bf3727338094739ba00ede --- /dev/null +++ b/docs/transformers/tests/models/clip/test_image_processing_clip.py @@ -0,0 +1,128 @@ +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from transformers import CLIPImageProcessor + + if is_torchvision_available(): + from transformers import CLIPImageProcessorFast + + +class CLIPImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + do_convert_rgb=True, + ): + super().__init__() + size = size if size is not None else {"shortest_edge": 20} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + def expected_output_image_shape(self, images): + 
return self.num_channels, self.crop_size["height"], self.crop_size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class CLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = CLIPImageProcessor if is_vision_available() else None + fast_image_processing_class = CLIPImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = CLIPImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 20}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) + self.assertEqual(image_processor.size, {"shortest_edge": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) diff --git a/docs/transformers/tests/models/clip/test_modeling_clip.py b/docs/transformers/tests/models/clip/test_modeling_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..739ae1f5903d34f11ce3c1197c8d66b80a7dbb49 --- /dev/null +++ b/docs/transformers/tests/models/clip/test_modeling_clip.py @@ -0,0 +1,948 @@ +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch CLIP model.""" + +import inspect +import os +import tempfile +import unittest + +import numpy as np +import requests +from parameterized import parameterized +from pytest import mark + +from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig +from transformers.testing_utils import ( + require_flash_attn, + require_torch, + require_torch_gpu, + require_torch_sdpa, + require_vision, + slow, + torch_device, +) +from transformers.utils import ( + is_torch_available, + is_vision_available, +) + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION, + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, + is_flaky, + random_attention_mask, +) +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + CLIPForImageClassification, + CLIPModel, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPVisionModel, + CLIPVisionModelWithProjection, + ) + +if is_vision_available(): + from PIL import Image + + from transformers import CLIPProcessor + + +class CLIPVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return CLIPVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = CLIPVisionModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + 
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_with_projection(self, config, pixel_values): + model = CLIPVisionModelWithProjection(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.image_embeds.shape, (self.batch_size, self.projection_dim)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) + @require_torch_sdpa + def test_eager_matches_sdpa_inference(self, *args): + return getattr(ModelTesterMixin, self._testMethodName)(self) + + +class CLIPModelTesterMixin(ModelTesterMixin): + """ + Subclass of ModelTesterMixin with methods specific to testing CLIP models. + The SDPA equivalence test is overridden here because CLIP models may have test/vision/text+vision inputs, + different output logits, and are not supposed to be used or tested with padding_side="left". + """ + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + # Load the model with SDPA (it is the default, but we explicit it for clarity) + model_sdpa = model_class.from_pretrained(tmpdirname, attn_implementation="sdpa") + model_sdpa = model_sdpa.eval().to(torch_device) + + # Load model with eager attention + model_eager = model_class.from_pretrained( + tmpdirname, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + if hasattr(model_sdpa, "vision_model"): + self.assertTrue(model_sdpa.vision_model.config._attn_implementation == "sdpa") + self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") + + if hasattr(model_sdpa, "text_model"): + self.assertTrue(model_sdpa.text_model.config._attn_implementation == "sdpa") + self.assertTrue(model_eager.text_model.config._attn_implementation == "eager") + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model_eager.config._attn_implementation == "eager") + + +@require_torch +class CLIPVisionModelTest(CLIPModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = (CLIPVisionModel, CLIPVisionModelWithProjection) if is_torch_available() else () + fx_compatible = True + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = CLIPVisionModelTester(self) + self.config_tester = ConfigTester(self, config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="CLIP does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_projection(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "openai/clip-vit-base-patch32" + model = CLIPVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + def test_model_with_projection_from_pretrained(self): + model_name = "openai/clip-vit-base-patch32" + model = CLIPVisionModelWithProjection.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertTrue(hasattr(model, "visual_projection")) + + @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) + @require_torch_sdpa + @is_flaky() + def test_eager_matches_sdpa_inference(self, *args): + # adding only flaky decorator here and call the parent test method + return getattr(ModelTesterMixin, self._testMethodName)(self) + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + + +class CLIPTextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + 
max_position_embeddings=512, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = self.get_config() + + return config, input_ids, input_mask + + def get_config(self): + return CLIPTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, input_ids, input_mask): + model = CLIPTextModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_with_projection(self, config, input_ids, input_mask): + model = CLIPTextModelWithProjection(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.text_embeds.shape, (self.batch_size, self.projection_dim)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class CLIPTextModelTest(CLIPModelTesterMixin, unittest.TestCase): + all_model_classes = (CLIPTextModel, CLIPTextModelWithProjection) if is_torch_available() else () + fx_compatible = True + test_pruning = False + test_head_masking = False + model_split_percents = [0.5, 0.8, 0.9] + + def setUp(self): + self.model_tester = CLIPTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_projection(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="CLIP does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "openai/clip-vit-base-patch32" + model = CLIPTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + def test_model_with_projection_from_pretrained(self): + model_name = "openai/clip-vit-base-patch32" + model = CLIPTextModelWithProjection.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertTrue(hasattr(model, "text_projection")) + + @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) + @require_torch_sdpa + @slow + @is_flaky() + def test_eager_matches_sdpa_inference(self, *args): + # adding only flaky decorator here and call the parent test method + return getattr(ModelTesterMixin, self._testMethodName)(self) + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + + @require_torch_sdpa + def test_sdpa_can_dispatch_on_flash(self): + self.skipTest(reason="CLIPTextModel has two attention masks: `causal_attention_mask` and `attention_mask`") + + +class CLIPModelTester: + def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): + if text_kwargs is None: + text_kwargs = {} + if vision_kwargs is None: + vision_kwargs = {} + + self.parent = parent + self.text_model_tester = CLIPTextModelTester(parent, **text_kwargs) + self.vision_model_tester = CLIPVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test + self.is_training = is_training + + def prepare_config_and_inputs(self): + text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask, pixel_values + + def get_config(self): + return CLIPConfig.from_text_vision_configs( + self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + ) + + def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): + model = CLIPModel(config).to(torch_device).eval() + with torch.no_grad(): + result = model(input_ids, pixel_values, attention_mask) + self.parent.assertEqual( + result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + ) + + 
def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "return_loss": True, + } + return config, inputs_dict + + +@require_torch +class CLIPModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (CLIPModel,) if is_torch_available() else () + pipeline_model_mapping = ( + {"feature-extraction": CLIPModel, "image-feature-extraction": CLIPVisionModel} if is_torch_available() else {} + ) + fx_compatible = True + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + _is_composite = True + + def setUp(self): + self.model_tester = CLIPModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=CLIPConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="Inputs_embeds is tested in individual model tests") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="CLIPModel does not have input/output embeddings") + def test_model_get_set_embeddings(self): + pass + + # override as the `logit_scale` parameter initialization is different for CLIP + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + # check if `logit_scale` is initialized as per the original implementation + if name == "logit_scale": + self.assertAlmostEqual( + param.data.item(), + np.log(1 / 0.07), + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + self.skipTest(reason="test_torchscript is set to False") + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + configs_no_init.return_dict = False + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + try: + input_ids = inputs_dict["input_ids"] + pixel_values = inputs_dict["pixel_values"] # CLIP needs pixel_values + traced_model = torch.jit.trace(model, (input_ids, pixel_values)) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except 
Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + non_persistent_buffers = {} + for key in loaded_model_state_dict.keys(): + if key not in model_state_dict.keys(): + non_persistent_buffers[key] = loaded_model_state_dict[key] + + loaded_model_state_dict = { + key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers + } + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + model_buffers = list(model.buffers()) + for non_persistent_buffer in non_persistent_buffers.values(): + found_buffer = False + for i, model_buffer in enumerate(model_buffers): + if torch.equal(non_persistent_buffer, model_buffer): + found_buffer = True + break + + self.assertTrue(found_buffer) + model_buffers.pop(i) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_load_vision_text_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Save CLIPConfig and check if we can load CLIPVisionConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + vision_config = CLIPVisionConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) + + # Save CLIPConfig and check if we can load CLIPTextConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + text_config = CLIPTextConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) + + @slow + def test_model_from_pretrained(self): + model_name = "openai/clip-vit-base-patch32" + model = CLIPModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) + @require_torch_sdpa + @slow + @is_flaky() + def test_eager_matches_sdpa_inference(self, *args): + # adding only flaky decorator here and call the parent test method + return getattr(ModelTesterMixin, self._testMethodName)(self) + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + + @require_torch_sdpa + def test_sdpa_can_dispatch_on_flash(self): + self.skipTest(reason="CLIP text tower has two attention masks: `causal_attention_mask` and `attention_mask`") + + @require_torch_sdpa + def test_sdpa_can_compile_dynamic(self): + self.skipTest(reason="CLIP model can't be compiled dynamic, error in clip_loss`") + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + def test_flash_attn_2_inference_equivalence(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, 
attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) + model.to(torch_device) + + dummy_pixel_values = inputs_dict["pixel_values"].to(torch.bfloat16) + dummy_input_ids = inputs_dict["input_ids"] + + outputs = model(pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True) + outputs_fa = model_fa( + pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True + ) + + self.assertTrue( + torch.allclose(outputs.logits_per_image, outputs_fa.logits_per_image, atol=4e-2, rtol=4e-2), + f"Image logits max diff: {torch.max(torch.abs(outputs.logits_per_image - outputs_fa.logits_per_image))}", + ) + self.assertTrue( + torch.allclose(outputs.logits_per_text, outputs_fa.logits_per_text, atol=4e-2, rtol=4e-2), + f"Text logits max diff: {torch.max(torch.abs(outputs.logits_per_text - outputs_fa.logits_per_text))}", + ) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + def test_flash_attn_2_inference_equivalence_right_padding(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="eager" + ) + model.to(torch_device) + + dummy_pixel_values = inputs_dict["pixel_values"].to(torch.bfloat16) + dummy_input_ids = inputs_dict["input_ids"] + dummy_pixel_mask = inputs_dict["attention_mask"] + + # right padding + dummy_pixel_mask[:] = 1 + dummy_pixel_mask[:, -1:] = 0 + + outputs = model(pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True) + outputs_fa = model_fa( + pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True + ) + + logits_per_image_eager = outputs.logits_per_image[:, :-1] + logits_per_text_eager = outputs.logits_per_text[:, :-1] + + logits_per_image_sdpa = outputs_fa.logits_per_image[:, :-1] + logits_per_text_sdpa = outputs_fa.logits_per_text[:, :-1] + + self.assertTrue( + torch.allclose(logits_per_image_eager, logits_per_image_sdpa, atol=4e-2, rtol=4e-2), + f"Image logits max diff: {torch.max(torch.abs(logits_per_image_eager - logits_per_image_sdpa))}", + ) + self.assertTrue( + torch.allclose(logits_per_text_eager, logits_per_text_sdpa, atol=4e-2, rtol=4e-2), + f"Text logits max diff: {torch.max(torch.abs(logits_per_text_eager - logits_per_text_sdpa))}", + ) + + +class CLIPForImageClassificationModelTester(CLIPModelTester): + def __init__(self, parent): + super().__init__(parent) + self.batch_size = self.vision_model_tester.batch_size + self.num_hidden_layers = self.vision_model_tester.num_hidden_layers + self.hidden_size = self.vision_model_tester.hidden_size + self.seq_length = self.vision_model_tester.seq_length + + def prepare_config_and_inputs(self): + _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + config = self.get_config() + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + 
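        # only pixel_values flow into the common inputs here: the image-classification head
        # runs on the vision tower alone, so no text inputs are prepared for these tests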
config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class CLIPForImageClassificationModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (CLIPForImageClassification,) if is_torch_available() else () + pipeline_model_mapping = {"image-classification": CLIPForImageClassification} if is_torch_available() else {} + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + _is_composite = True + + def setUp(self): + self.model_tester = CLIPForImageClassificationModelTester(self) + + @unittest.skip(reason="CLIPForImageClassification does not support inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="CLIPForImageClassification does not support inputs_embeds") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet") + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet") + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet") + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="CLIP uses the same initialization scheme as the Flax original implementation") + def test_initialization(self): + pass + + @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) + @require_torch_sdpa + @slow + @is_flaky() + def test_eager_matches_sdpa_inference(self, *args): + # adding only flaky decorator here and call the parent test method + return getattr(ModelTesterMixin, self._testMethodName)(self) + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@require_vision +@require_torch +class CLIPModelIntegrationTest(unittest.TestCase): + @slow + def test_inference(self): + model_name = "openai/clip-vit-base-patch32" + model = CLIPModel.from_pretrained(model_name, attn_implementation="sdpa").to(torch_device) + processor = CLIPProcessor.from_pretrained(model_name) + + image = prepare_img() + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt" + ).to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + self.assertEqual( + outputs.logits_per_image.shape, + torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + ) + self.assertEqual( + outputs.logits_per_text.shape, + torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), + ) + + expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) + + torch.testing.assert_close(outputs.logits_per_image, expected_logits, rtol=1e-3, atol=1e-3) + + @slow + def test_inference_interpolate_pos_encoding(self): + # CLIP models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. 
The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. + model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(torch_device) + + processor = CLIPProcessor.from_pretrained( + "openai/clip-vit-base-patch32", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180} + ) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) + + # interpolate_pos_encodiung false should return value error + with self.assertRaises(ValueError, msg="doesn't match model"): + with torch.no_grad(): + model(**inputs, interpolate_pos_encoding=False) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 26, 768)) + + self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[-0.1538, 0.0322, -0.3235], [0.2893, 0.1135, -0.5708], [0.0461, 0.1540, -0.6018]] + ).to(torch_device) + + torch.testing.assert_close( + outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, rtol=6e-3, atol=4e-4 + ) diff --git a/docs/transformers/tests/models/clip/test_modeling_flax_clip.py b/docs/transformers/tests/models/clip/test_modeling_flax_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..d499f4bf7dcb06dbbcb998cb59dd0da7496da60a --- /dev/null +++ b/docs/transformers/tests/models/clip/test_modeling_flax_clip.py @@ -0,0 +1,468 @@ +import inspect +import tempfile +import unittest + +import numpy as np + +from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig, is_flax_available +from transformers.testing_utils import require_flax, slow + +from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_flax_available(): + import jax + + from transformers.models.clip.modeling_flax_clip import ( + FlaxCLIPModel, + FlaxCLIPTextModel, + FlaxCLIPTextModelWithProjection, + FlaxCLIPVisionModel, + ) + + +class FlaxCLIPVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = CLIPVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + return 
config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_flax +class FlaxCLIPVisionModelTest(FlaxModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (FlaxCLIPVisionModel,) if is_flax_available() else () + + def setUp(self): + self.model_tester = FlaxCLIPVisionModelTester(self) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.__call__) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_jit_compilation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + + @jax.jit + def model_jitted(pixel_values, **kwargs): + return model(pixel_values=pixel_values, **kwargs).to_tuple() + + with self.subTest("JIT Enabled"): + jitted_outputs = model_jitted(**prepared_inputs_dict) + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + outputs = model_jitted(**prepared_inputs_dict) + + self.assertEqual(len(outputs), len(jitted_outputs)) + for jitted_output, output in zip(jitted_outputs, outputs): + self.assertEqual(jitted_output.shape, output.shape) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + hidden_states = outputs.hidden_states + + self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) + + # CLIP has a different seq_length + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_length = num_patches + 1 + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // 
patch_size[1]) * (image_size[0] // patch_size[0]) + seq_length = num_patches + 1 + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length, seq_length], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length, seq_length], + ) + + # FlaxCLIPVisionModel does not have any base model + def test_save_load_from_base(self): + pass + + # FlaxCLIPVisionModel does not have any base model + def test_save_load_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("openai/clip-vit-base-patch32", from_pt=True) + outputs = model(np.ones((1, 3, 224, 224))) + self.assertIsNotNone(outputs) + + +class FlaxCLIPTextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + 
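        # the loop above rewrites each random mask into a contiguous run of 1s followed by 0s
        # (right padding); e.g. with seq_length=7 and start_index=3 a row becomes
        # [1, 1, 1, 0, 0, 0, 0], so every sample attends to at least its first token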
config = CLIPTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + return config, input_ids, input_mask + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_flax +class FlaxCLIPTextModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = (FlaxCLIPTextModel, FlaxCLIPTextModelWithProjection) if is_flax_available() else () + + def setUp(self): + self.model_tester = FlaxCLIPTextModelTester(self) + + # FlaxCLIPTextModel does not have any base model + def test_save_load_from_base(self): + pass + + # FlaxCLIPVisionModel does not have any base model + def test_save_load_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("openai/clip-vit-base-patch32", from_pt=True) + outputs = model(np.ones((1, 1))) + self.assertIsNotNone(outputs) + + +class FlaxCLIPModelTester: + def __init__(self, parent, is_training=True): + self.parent = parent + self.text_model_tester = FlaxCLIPTextModelTester(parent) + self.vision_model_tester = FlaxCLIPVisionModelTester(parent) + self.is_training = is_training + + def prepare_config_and_inputs(self): + text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = CLIPConfig.from_text_vision_configs(text_config, vision_config, projection_dim=64) + + return config, input_ids, attention_mask, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + } + return config, inputs_dict + + +@require_flax +class FlaxCLIPModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = (FlaxCLIPModel,) if is_flax_available() else () + test_attention_outputs = False + + def setUp(self): + self.model_tester = FlaxCLIPModelTester(self) + + # hidden_states are tested in individual model tests + def test_hidden_states_output(self): + pass + + def test_jit_compilation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + + @jax.jit + def model_jitted(input_ids, pixel_values, **kwargs): + return model(input_ids=input_ids, pixel_values=pixel_values, **kwargs).to_tuple() + + with self.subTest("JIT Enabled"): + jitted_outputs = model_jitted(**prepared_inputs_dict) + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + outputs = model_jitted(**prepared_inputs_dict) + + self.assertEqual(len(outputs), len(jitted_outputs)) + for jitted_output, output in zip(jitted_outputs[:4], outputs[:4]): + 
self.assertEqual(jitted_output.shape, output.shape) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.__call__) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["input_ids", "pixel_values", "attention_mask", "position_ids"] + self.assertListEqual(arg_names[:4], expected_arg_names) + + def test_get_image_features(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = FlaxCLIPModel(config) + + @jax.jit + def model_jitted(pixel_values): + return model.get_image_features(pixel_values=pixel_values) + + with self.subTest("JIT Enabled"): + jitted_output = model_jitted(inputs_dict["pixel_values"]) + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + output = model_jitted(inputs_dict["pixel_values"]) + + self.assertEqual(jitted_output.shape, output.shape) + self.assertTrue(np.allclose(jitted_output, output, atol=1e-3)) + + def test_get_text_features(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = FlaxCLIPModel(config) + + @jax.jit + def model_jitted(input_ids, attention_mask, **kwargs): + return model.get_text_features(input_ids=input_ids, attention_mask=attention_mask) + + with self.subTest("JIT Enabled"): + jitted_output = model_jitted(**inputs_dict) + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + output = model_jitted(**inputs_dict) + + self.assertEqual(jitted_output.shape, output.shape) + self.assertTrue(np.allclose(jitted_output, output, atol=1e-3)) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("openai/clip-vit-base-patch32", from_pt=True) + outputs = model(input_ids=np.ones((1, 1)), pixel_values=np.ones((1, 3, 224, 224))) + self.assertIsNotNone(outputs) + + # overwrite from common since FlaxCLIPModel returns nested output + # which is not supported in the common test + def test_from_pretrained_save_pretrained(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + if model_class.__name__ != "FlaxBertModel": + continue + + with self.subTest(model_class.__name__): + model = model_class(config) + + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + outputs = model(**prepared_inputs_dict).to_tuple() + + # verify that normal save_pretrained works as expected + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_loaded = model_class.from_pretrained(tmpdirname) + + outputs_loaded = model_loaded(**prepared_inputs_dict).to_tuple()[:4] + for output_loaded, output in zip(outputs_loaded, outputs): + self.assert_almost_equals(output_loaded, output, 1e-3) + + # verify that save_pretrained for distributed training + # with `params=params` works as expected + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, params=model.params) + model_loaded = model_class.from_pretrained(tmpdirname) + + outputs_loaded = model_loaded(**prepared_inputs_dict).to_tuple()[:4] + for output_loaded, output in zip(outputs_loaded, outputs): + self.assert_almost_equals(output_loaded, output, 1e-3) diff --git 
a/docs/transformers/tests/models/clip/test_modeling_tf_clip.py b/docs/transformers/tests/models/clip/test_modeling_tf_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..27db72e3970657ae1adbe7371f2a1b1788334d67 --- /dev/null +++ b/docs/transformers/tests/models/clip/test_modeling_tf_clip.py @@ -0,0 +1,662 @@ +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the TensorFlow CLIP model.""" + +from __future__ import annotations + +import inspect +import os +import tempfile +import unittest +from importlib import import_module + +import requests + +from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig +from transformers.testing_utils import require_tf, require_vision, slow +from transformers.utils import is_tf_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFCLIPModel, TFCLIPTextModel, TFCLIPVisionModel, TFSharedEmbeddings + from transformers.modeling_tf_utils import keras + + +if is_vision_available(): + from PIL import Image + + from transformers import CLIPProcessor + + +class TFCLIPVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return CLIPVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = TFCLIPVisionModel(config=config) + result = model(pixel_values, training=False) + # expected sequence length = num_patches 
+ 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_tf +class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (TFCLIPVisionModel,) if is_tf_available() else () + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFCLIPVisionModelTester(self) + self.config_tester = ConfigTester(self, config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_inputs_embeds(self): + # CLIP does not use inputs_embeds + pass + + def test_graph_mode_with_inputs_embeds(self): + # CLIP does not use inputs_embeds + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, keras.layers.Layer)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_len = num_patches + 1 + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = 
model(**self._prepare_for_class(inputs_dict, model_class), training=False) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) + + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # CLIP has a different seq_length + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_length = num_patches + 1 + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + @slow + def test_model_from_pretrained(self): + model_name = "openai/clip-vit-base-patch32" + model = TFCLIPVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + def test_saved_model_creation_extended(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + if hasattr(config, "use_cache"): + config.use_cache = True + + # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_len = num_patches + 1 + + for model_class in self.all_model_classes: + class_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + num_out = len(model(class_inputs_dict)) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, saved_model=True) + saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") + model = keras.models.load_model(saved_model_dir) + outputs = model(class_inputs_dict) + output_hidden_states = outputs["hidden_states"] + output_attentions = outputs["attentions"] + 
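                # the reloaded SavedModel exposes its outputs by name, hence the
                # dictionary-style lookups above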
+ # Check num outputs + self.assertEqual(len(outputs), num_out) + + # Check num layers + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + self.assertEqual(len(output_hidden_states), expected_num_layers) + self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers) + + # Check attention outputs + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_len = num_patches + 1 + + self.assertListEqual( + list(output_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + + # Check hidden states + self.assertListEqual( + list(output_hidden_states[0].shape[-2:]), + [seq_len, self.model_tester.hidden_size], + ) + + +class TFCLIPTextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + # make sure the first token has attention mask `1` to ensure that, after combining the causal mask, there + # is still at least one token being attended to for each batch. + # TODO: Change `random_attention_mask` in PT/TF/Flax common test file, after a discussion with the team. 
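            # the concat on the next line prepends a column of ones, guaranteeing that the first
            # position of every row is attended to, e.g. a sampled row [0, 1, 0, 1, 1, 0, 1]
            # becomes [1, 1, 0, 1, 1, 0, 1]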
+ input_mask = tf.concat( + [tf.ones_like(input_mask[:, :1], dtype=input_mask.dtype), input_mask[:, 1:]], axis=-1 + ) + + config = self.get_config() + + return config, input_ids, input_mask + + def get_config(self): + return CLIPTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, input_ids, input_mask): + model = TFCLIPTextModel(config=config) + result = model(input_ids, attention_mask=input_mask, training=False) + result = model(input_ids, training=False) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = (TFCLIPTextModel,) if is_tf_available() else () + test_pruning = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFCLIPTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_inputs_embeds(self): + # CLIP does not use inputs_embeds + pass + + @slow + def test_model_from_pretrained(self): + model_name = "openai/clip-vit-base-patch32" + model = TFCLIPTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + def test_saved_model_creation_extended(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + if hasattr(config, "use_cache"): + config.use_cache = True + + for model_class in self.all_model_classes: + class_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + num_out = len(model(class_inputs_dict)) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, saved_model=True) + saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") + model = keras.models.load_model(saved_model_dir) + outputs = model(class_inputs_dict) + output_hidden_states = outputs["hidden_states"] + output_attentions = outputs["attentions"] + + # Check number of outputs + self.assertEqual(len(outputs), num_out) + + # Check number of layers + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + # Check hidden states + self.assertEqual(len(output_hidden_states), expected_num_layers) + self.assertListEqual( + list(output_hidden_states[0].shape[-2:]), + [self.model_tester.seq_length, self.model_tester.hidden_size], + ) + + # Check attention outputs + self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers) + + 
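                # key_length below falls back to seq_length when the tester does not define it;
                # with this tester's defaults (seq_length=7, num_attention_heads=4) the trailing
                # dimensions checked here are [4, 7, 7]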
seq_length = self.model_tester.seq_length + key_length = getattr(self.model_tester, "key_length", seq_length) + + self.assertListEqual( + list(output_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length, key_length], + ) + + +class TFCLIPModelTester: + def __init__(self, parent, is_training=True): + self.parent = parent + self.text_model_tester = TFCLIPTextModelTester(parent) + self.vision_model_tester = TFCLIPVisionModelTester(parent) + self.is_training = is_training + + def prepare_config_and_inputs(self): + text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask, pixel_values + + def get_config(self): + return CLIPConfig.from_text_vision_configs( + self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + ) + + def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): + model = TFCLIPModel(config) + result = model(input_ids, pixel_values, attention_mask, training=False) + self.parent.assertEqual( + result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "return_loss": True, + } + return config, inputs_dict + + +@require_tf +class TFCLIPModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (TFCLIPModel,) if is_tf_available() else () + pipeline_model_mapping = {"feature-extraction": TFCLIPModel} if is_tf_available() else {} + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + test_onnx = False + + def setUp(self): + self.model_tester = TFCLIPModelTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + # hidden_states are tested in individual model tests + def test_hidden_states_output(self): + pass + + # input_embeds are tested in individual model tests + def test_inputs_embeds(self): + pass + + # CLIPModel does not have input/output embeddings + def test_model_common_attributes(self): + pass + + # overwrite from common since `TFCLIPModelTester` set `return_loss` to `True` and causes the preparation of + # `symbolic_inputs` failed. + def test_keras_save_load(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # remove `return_loss` to make code work + if self.__class__.__name__ == "TFCLIPModelTest": + inputs_dict.pop("return_loss", None) + + tf_main_layer_classes = { + module_member + for model_class in self.all_model_classes + for module in (import_module(model_class.__module__),) + for module_member_name in dir(module) + if module_member_name.endswith("MainLayer") + # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`. 
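            # e.g. for TFCLIPModel the stripped names compare "TFCLIP" == "TFCLIP", so only the
            # matching MainLayer is kept while the text- and vision-specific MainLayer classes
            # are filtered out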
+ and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] + for module_member in (getattr(module, module_member_name),) + if isinstance(module_member, type) + and keras.layers.Layer in module_member.__bases__ + and getattr(module_member, "_keras_serializable", False) + } + for main_layer_class in tf_main_layer_classes: + # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter + if "T5" in main_layer_class.__name__: + # Take the same values than in TFT5ModelTester for this shared layer + shared = TFSharedEmbeddings(99, 32, name="shared") + config.use_cache = inputs_dict.pop("use_cache", None) + main_layer = main_layer_class(config, embed_tokens=shared) + else: + main_layer = main_layer_class(config) + + symbolic_inputs = { + name: keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() + } + + model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) + outputs = model(inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + filepath = os.path.join(tmpdirname, "keras_model.h5") + model.save(filepath) + if "T5" in main_layer_class.__name__: + model = keras.models.load_model( + filepath, + custom_objects={ + main_layer_class.__name__: main_layer_class, + "TFSharedEmbeddings": TFSharedEmbeddings, + }, + ) + else: + model = keras.models.load_model( + filepath, custom_objects={main_layer_class.__name__: main_layer_class} + ) + assert isinstance(model, keras.Model) + after_outputs = model(inputs_dict) + self.assert_outputs_same(after_outputs, outputs) + + @slow + def test_model_from_pretrained(self): + model_name = "openai/clip-vit-base-patch32" + model = TFCLIPModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @unittest.skip(reason="Currently `saved_model` doesn't work with nested outputs.") + @slow + def test_saved_model_creation(self): + pass + + @unittest.skip(reason="Currently `saved_model` doesn't work with nested outputs.") + @slow + def test_saved_model_creation_extended(self): + pass + + @unittest.skip(reason="`saved_model` doesn't work with nested outputs so no preparation happens.") + @slow + def test_prepare_serving_output(self): + pass + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@require_vision +@require_tf +class TFCLIPModelIntegrationTest(unittest.TestCase): + @slow + def test_inference(self): + model_name = "openai/clip-vit-base-patch32" + model = TFCLIPModel.from_pretrained(model_name) + processor = CLIPProcessor.from_pretrained(model_name) + + image = prepare_img() + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="tf" + ) + + outputs = model(**inputs, training=False) + + # verify the logits + self.assertEqual( + outputs.logits_per_image.shape, + tf.TensorShape((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + ) + self.assertEqual( + outputs.logits_per_text.shape, + tf.TensorShape((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), + ) + + expected_logits = tf.constant([[24.5701, 19.3049]]) + + tf.debugging.assert_near(outputs.logits_per_image, expected_logits, atol=1e-3) diff --git a/docs/transformers/tests/models/clip/test_processor_clip.py b/docs/transformers/tests/models/clip/test_processor_clip.py new file mode 100644 index 
0000000000000000000000000000000000000000..1b6eed7534885034a99ceba28c1e50d1bc1b31d7 --- /dev/null +++ b/docs/transformers/tests/models/clip/test_processor_clip.py @@ -0,0 +1,199 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil +import tempfile +import unittest + +import pytest + +from transformers import CLIPTokenizer, CLIPTokenizerFast +from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES +from transformers.testing_utils import require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import CLIPImageProcessor, CLIPProcessor + + +@require_vision +class CLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = CLIPProcessor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l", "w", "r", "t", "low", "er", "lowest", "newer", "wider", "", "<|startoftext|>", "<|endoftext|>"] # fmt: skip + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "l o", "lo w", "e r", ""] + cls.special_tokens_map = {"unk_token": ""} + + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(cls.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + image_processor_map = { + "do_resize": True, + "size": 20, + "do_center_crop": True, + "crop_size": 18, + "do_normalize": True, + "image_mean": [0.48145466, 0.4578275, 0.40821073], + "image_std": [0.26862954, 0.26130258, 0.27577711], + } + cls.image_processor_file = os.path.join(cls.tmpdirname, IMAGE_PROCESSOR_NAME) + with open(cls.image_processor_file, "w", encoding="utf-8") as fp: + json.dump(image_processor_map, fp) + + @classmethod + def get_tokenizer(cls, **kwargs): + return CLIPTokenizer.from_pretrained(cls.tmpdirname, **kwargs) + + @classmethod + def get_rust_tokenizer(cls, **kwargs): + return CLIPTokenizerFast.from_pretrained(cls.tmpdirname, **kwargs) + + @classmethod + def get_image_processor(cls, **kwargs): + return CLIPImageProcessor.from_pretrained(cls.tmpdirname, **kwargs) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname) + + def test_save_load_pretrained_default(self): + tokenizer_slow = self.get_tokenizer() + tokenizer_fast = self.get_rust_tokenizer() + image_processor = self.get_image_processor() + + with tempfile.TemporaryDirectory() as tmpdir: + processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) + processor_slow.save_pretrained(tmpdir) + processor_slow = CLIPProcessor.from_pretrained(tmpdir, use_fast=False) + + processor_fast = 
CLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast.save_pretrained(tmpdir) + processor_fast = CLIPProcessor.from_pretrained(tmpdir) + + self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) + self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) + self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) + self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer) + self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast) + + self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertIsInstance(processor_slow.image_processor, CLIPImageProcessor) + self.assertIsInstance(processor_fast.image_processor, CLIPImageProcessor) + + def test_save_load_pretrained_additional_features(self): + with tempfile.TemporaryDirectory() as tmpdir: + processor = CLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor.save_pretrained(tmpdir) + + tokenizer_add_kwargs = CLIPTokenizer.from_pretrained(tmpdir, bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = CLIPImageProcessor.from_pretrained( + tmpdir, do_normalize=False, padding_value=1.0 + ) + + processor = CLIPProcessor.from_pretrained( + tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, CLIPImageProcessor) + + def test_image_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) + + image_input = self.prepare_image_inputs() + + input_image_proc = image_processor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_image_proc.keys(): + self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"]) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + def test_tokenizer_decode(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 
8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + def test_model_input_names(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), processor.model_input_names) diff --git a/docs/transformers/tests/models/clip/test_tokenization_clip.py b/docs/transformers/tests/models/clip/test_tokenization_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..fca53f07cd3ecffef890754232a3103dc2bb3126 --- /dev/null +++ b/docs/transformers/tests/models/clip/test_tokenization_clip.py @@ -0,0 +1,192 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import unittest +from functools import lru_cache + +from transformers import CLIPTokenizer, CLIPTokenizerFast +from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES +from transformers.testing_utils import require_ftfy, require_tokenizers + +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible + + +@require_tokenizers +class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "openai/clip-vit-base-patch32" + tokenizer_class = CLIPTokenizer + rust_tokenizer_class = CLIPTokenizerFast + test_rust_tokenizer = True + from_pretrained_kwargs = {} + test_seq2seq = False + + @classmethod + def setUpClass(cls): + super().setUpClass() + + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l", "w", "r", "t", "low", "er", "lowest", "newer", "wider", "", "<|startoftext|>", "<|endoftext|>"] # fmt: skip + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "l o", "lo w", "e r"] + cls.special_tokens_map = {"unk_token": ""} + + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(cls.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return CLIPTokenizer.from_pretrained(pretrained_name, **kwargs) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return 
CLIPTokenizerFast.from_pretrained(pretrained_name, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "lower newer" + output_text = "lower newer" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = CLIPTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "lower newer" + bpe_tokens = ["lo", "w", "er", "n", "e", "w", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [10, 2, 16, 9, 3, 2, 16, 20] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + @require_ftfy + def test_check_encoding_slow_fast(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_s = self.get_tokenizer(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + + text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d." + text_tokenized_s = tokenizer_s.tokenize(text) + text_tokenized_r = tokenizer_r.tokenize(text) + + self.assertListEqual(text_tokenized_s, text_tokenized_r) + + # Test that the tokenization is identical on an example containing a character (Latin Small Letter A + # with Tilde) encoded in 2 different ways + text = "xa\u0303y" + " " + "x\xe3y" + text_tokenized_s = tokenizer_s.tokenize(text) + text_tokenized_r = tokenizer_r.tokenize(text) + + self.assertListEqual(text_tokenized_s, text_tokenized_r) + + # Test that the tokenization is identical on unicode of space type + spaces_unicodes = [ + "\u0009", # (horizontal tab, '\t') + "\u000b", # (vertical tab) + "\u000c", # (form feed) + "\u0020", # (space, ' ') + "\u200e", # (left-to-right mark):w + "\u200f", # (right-to-left mark) + ] + for unicode_seq in spaces_unicodes: + text_tokenized_s = tokenizer_s.tokenize(unicode_seq) + text_tokenized_r = tokenizer_r.tokenize(unicode_seq) + + self.assertListEqual(text_tokenized_s, text_tokenized_r) + + # Test that the tokenization is identical on unicode of line break type + line_break_unicodes = [ + "\u000a", # (line feed, '\n') + "\r\n", # (carriage return and line feed, '\r\n') + "\u000d", # (carriage return, '\r') + "\r", # (carriage return, '\r') + "\u000d", # (carriage return, '\r') + "\u2028", # (line separator) + "\u2029", # (paragraph separator) + # "\u0085", # (next line) + ] + + # The tokenization is not identical for the character "\u0085" (next line). The slow version using ftfy transforms + # it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a + # space (and thus into an empty list). 
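+            # Illustrative sketch (not part of the upstream test): assuming the
+            # "openai/clip-vit-base-patch32" checkpoint used by this test class is
+            # reachable, the "\u0085" divergence described above can be reproduced
+            # directly, which is why that character is commented out of
+            # `line_break_unicodes` above:
+            #
+            #   slow = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+            #   fast = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
+            #   slow.tokenize("\u0085")  # ftfy maps the char to "…", yielding tokens
+            #   fast.tokenize("\u0085")  # mapped to a plain space, yielding []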
+ + for unicode_seq in line_break_unicodes: + text_tokenized_s = tokenizer_s.tokenize(unicode_seq) + text_tokenized_r = tokenizer_r.tokenize(unicode_seq) + + self.assertListEqual(text_tokenized_s, text_tokenized_r) + + def test_offsets_mapping_with_different_add_prefix_space_argument(self): + # Test which aims to verify that the offsets are well adapted to the argument `add_prefix_space` + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name` + text = f"{text_of_1_token} {text_of_1_token}" + + tokenizer_r = self.get_rust_tokenizer( + pretrained_name, + use_fast=True, + ) + encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) + self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token))) + self.assertEqual( + encoding.offset_mapping[1], + (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), + ) + + text = f" {text}" + + tokenizer_r = self.get_rust_tokenizer( + pretrained_name, + use_fast=True, + ) + encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) + self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token))) + self.assertEqual( + encoding.offset_mapping[1], + (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), + ) + + def test_log_warning(self): + # Test related to the breaking change introduced in transformers v4.17.0 + # We need to check that an error in raised when the user try to load a previous version of the tokenizer. + with self.assertRaises(ValueError) as context: + self.get_rust_tokenizer("robot-test/old-clip-tokenizer") + + self.assertTrue( + context.exception.args[0].startswith( + "The `backend_tokenizer` provided does not match the expected format." + ) + ) + + @require_ftfy + def test_tokenization_python_rust_equals(self): + super().test_tokenization_python_rust_equals() + + @unittest.skip(reason="CLIP always lower cases letters") + def test_added_tokens_do_lower_case(self): + pass diff --git a/docs/transformers/tests/models/clipseg/__init__.py b/docs/transformers/tests/models/clipseg/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/clipseg/test_modeling_clipseg.py b/docs/transformers/tests/models/clipseg/test_modeling_clipseg.py new file mode 100644 index 0000000000000000000000000000000000000000..0a0bc5a758230a20e750c429e99e73445cd05d41 --- /dev/null +++ b/docs/transformers/tests/models/clipseg/test_modeling_clipseg.py @@ -0,0 +1,714 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch CLIPSeg model.""" + +import inspect +import os +import tempfile +import unittest + +import numpy as np +import requests + +from transformers import CLIPSegConfig, CLIPSegProcessor, CLIPSegTextConfig, CLIPSegVisionConfig +from transformers.testing_utils import ( + require_torch, + require_vision, + slow, + torch_device, +) +from transformers.utils import is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, + random_attention_mask, +) +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import CLIPSegForImageSegmentation, CLIPSegModel, CLIPSegTextModel, CLIPSegVisionModel + from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES + + +if is_vision_available(): + from PIL import Image + + +class CLIPSegVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return CLIPSegVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = CLIPSegVisionModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class 
CLIPSegVisionModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as CLIPSeg does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (CLIPSegVisionModel,) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = CLIPSegVisionModelTester(self) + self.config_tester = ConfigTester( + self, config_class=CLIPSegVisionConfig, has_text_modality=False, hidden_size=37 + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="CLIPSeg does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "CIDAS/clipseg-rd64-refined" + model = CLIPSegVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class CLIPSegTextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + + def 
prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = self.get_config() + + return config, input_ids, input_mask + + def get_config(self): + return CLIPSegTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, input_ids, input_mask): + model = CLIPSegTextModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class CLIPSegTextModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (CLIPSegTextModel,) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_head_masking = False + model_split_percents = [0.5, 0.8, 0.9] + + def setUp(self): + self.model_tester = CLIPSegTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=CLIPSegTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="CLIPSeg does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "CIDAS/clipseg-rd64-refined" + model = CLIPSegTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class CLIPSegModelTester: + def __init__( + self, + parent, + text_kwargs=None, + vision_kwargs=None, + is_training=True, + # This should respect the `num_hidden_layers` in `CLIPSegVisionModelTester` + extract_layers=(1,), + ): + if 
text_kwargs is None: + text_kwargs = {} + if vision_kwargs is None: + vision_kwargs = {} + + self.parent = parent + self.text_model_tester = CLIPSegTextModelTester(parent, **text_kwargs) + self.vision_model_tester = CLIPSegVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test + self.is_training = is_training + self.extract_layers = extract_layers + + def prepare_config_and_inputs(self): + text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask, pixel_values + + def get_config(self): + return CLIPSegConfig.from_text_vision_configs( + self.text_model_tester.get_config(), + self.vision_model_tester.get_config(), + projection_dim=64, + reduce_dim=32, + extract_layers=self.extract_layers, + ) + + def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): + model = CLIPSegModel(config).to(torch_device).eval() + with torch.no_grad(): + result = model(input_ids, pixel_values, attention_mask) + self.parent.assertEqual( + result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + ) + + def create_and_check_model_for_image_segmentation(self, config, input_ids, attention_maks, pixel_values): + model = CLIPSegForImageSegmentation(config).to(torch_device).eval() + with torch.no_grad(): + result = model(input_ids, pixel_values) + self.parent.assertEqual( + result.logits.shape, + ( + self.vision_model_tester.batch_size, + self.vision_model_tester.image_size, + self.vision_model_tester.image_size, + ), + ) + self.parent.assertEqual( + result.conditional_embeddings.shape, (self.text_model_tester.batch_size, config.projection_dim) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + } + return config, inputs_dict + + +@require_torch +class CLIPSegModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (CLIPSegModel, CLIPSegForImageSegmentation) if is_torch_available() else () + pipeline_model_mapping = {"feature-extraction": CLIPSegModel} if is_torch_available() else {} + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + # CLIPSegForImageSegmentation requires special treatment + if return_labels: + if model_class.__name__ == "CLIPSegForImageSegmentation": + batch_size, _, height, width = inputs_dict["pixel_values"].shape + inputs_dict["labels"] = torch.zeros( + [batch_size, height, width], device=torch_device, dtype=torch.float + ) + + return inputs_dict + + def setUp(self): + self.model_tester = CLIPSegModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=CLIPSegConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_model(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model_for_image_segmentation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_for_image_segmentation(*config_and_inputs) + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="Inputs_embeds is tested in individual model tests") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="CLIPSegModel does not have input/output embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + # override as the some parameters require custom initialization + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + # check if `logit_scale` is initialized as per the original implementation + if "logit_scale" in name: + self.assertAlmostEqual( + param.data.item(), + np.log(1 / 0.07), + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + elif "film" in name or "transposed_conv" in name or "reduce" in name: + # those parameters use PyTorch' default nn.Linear initialization scheme + pass + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + self.skipTest(reason="test_torchscript is set to False") + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + configs_no_init.return_dict = False + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + try: + input_ids = inputs_dict["input_ids"] + pixel_values = inputs_dict["pixel_values"] # CLIPSeg needs pixel_values + traced_model = torch.jit.trace(model, (input_ids, pixel_values)) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = 
torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + non_persistent_buffers = {} + for key in loaded_model_state_dict.keys(): + if key not in model_state_dict.keys(): + non_persistent_buffers[key] = loaded_model_state_dict[key] + + loaded_model_state_dict = { + key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers + } + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + model_buffers = list(model.buffers()) + for non_persistent_buffer in non_persistent_buffers.values(): + found_buffer = False + for i, model_buffer in enumerate(model_buffers): + if torch.equal(non_persistent_buffer, model_buffer): + found_buffer = True + break + + self.assertTrue(found_buffer) + model_buffers.pop(i) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_load_vision_text_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Save CLIPSegConfig and check if we can load CLIPSegVisionConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + vision_config = CLIPSegVisionConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) + + # Save CLIPSegConfig and check if we can load CLIPSegTextConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + text_config = CLIPSegTextConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) + + def test_training(self): + if not self.model_tester.is_training: + self.skipTest(reason="Training test is skipped as the model was not trained") + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values(): + continue + + print("Model class:", model_class) + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + for k, v in inputs.items(): + print(k, v.shape) + loss = model(**inputs).loss + loss.backward() + + @slow + def test_model_from_pretrained(self): + model_name = "CIDAS/clipseg-rd64-refined" + model = CLIPSegModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + return image + + +@require_vision +@require_torch +class CLIPSegModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_image_segmentation(self): + model_name = "CIDAS/clipseg-rd64-refined" + processor = CLIPSegProcessor.from_pretrained(model_name) + model = CLIPSegForImageSegmentation.from_pretrained(model_name).to(torch_device) + + image = prepare_img() + texts = ["a cat", "a remote", "a blanket"] + inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt").to(torch_device) + + # 
forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the predicted masks + self.assertEqual( + outputs.logits.shape, + torch.Size((3, 352, 352)), + ) + expected_masks_slice = torch.tensor( + [[-7.4613, -7.4785, -7.3628], [-7.3268, -7.0899, -7.1333], [-6.9838, -6.7900, -6.8913]] + ).to(torch_device) + + torch.testing.assert_close(outputs.logits[0, :3, :3], expected_masks_slice, rtol=1e-3, atol=1e-3) + + # verify conditional and pooled output + expected_conditional = torch.tensor([0.5601, -0.0314, 0.1980]).to(torch_device) + expected_pooled_output = torch.tensor([0.5036, -0.2681, -0.2644]).to(torch_device) + torch.testing.assert_close(outputs.conditional_embeddings[0, :3], expected_conditional, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(outputs.pooled_output[0, :3], expected_pooled_output, rtol=1e-3, atol=1e-3) + + @slow + def test_inference_interpolate_pos_encoding(self): + # ViT models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. + model = CLIPSegModel.from_pretrained("openai/clip-vit-base-patch32").to(torch_device) + + processor = CLIPSegProcessor.from_pretrained( + "openai/clip-vit-base-patch32", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180} + ) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) + + # interpolate_pos_encodiung false should return value error + with self.assertRaises(ValueError, msg="doesn't match model"): + with torch.no_grad(): + model(**inputs, interpolate_pos_encoding=False) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 26, 768)) + + self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[-0.1538, 0.0322, -0.3235], [0.2893, 0.1135, -0.5708], [0.0461, 0.1540, -0.6018]] + ).to(torch_device) + + torch.testing.assert_close( + outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4 + ) diff --git a/docs/transformers/tests/models/clipseg/test_processor_clipseg.py b/docs/transformers/tests/models/clipseg/test_processor_clipseg.py new file mode 100644 index 0000000000000000000000000000000000000000..5147ed39753913c63b088ccc887ffe08f5bc9331 --- /dev/null +++ b/docs/transformers/tests/models/clipseg/test_processor_clipseg.py @@ -0,0 +1,194 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
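+# Illustrative note (not part of the upstream file): `CLIPSegProcessor` simply pairs a
+# CLIP tokenizer (slow or fast) with a `ViTImageProcessor`, so a minimal end-to-end call
+# looks roughly like the sketch below, where `saved_dir` is a placeholder for any
+# directory holding the saved tokenizer and image-processor files (as built in `setUp`):
+#
+#   processor = CLIPSegProcessor(
+#       tokenizer=CLIPTokenizer.from_pretrained(saved_dir),
+#       image_processor=ViTImageProcessor.from_pretrained(saved_dir),
+#   )
+#   batch = processor(text="lower newer", images=image, return_tensors="pt")
+#   # -> keys: "input_ids", "attention_mask", "pixel_values"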
+ +import json +import os +import shutil +import tempfile +import unittest + +import pytest + +from transformers import CLIPTokenizer, CLIPTokenizerFast +from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES +from transformers.testing_utils import require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import CLIPSegProcessor, ViTImageProcessor + + +@require_vision +class CLIPSegProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = CLIPSegProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l", "w", "r", "t", "low", "er", "lowest", "newer", "wider", "", "<|startoftext|>", "<|endoftext|>"] # fmt: skip + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "l o", "lo w", "e r", ""] + self.special_tokens_map = {"unk_token": ""} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + image_processor_map = { + "do_resize": True, + "size": 20, + "do_center_crop": True, + "crop_size": 18, + "do_normalize": True, + "image_mean": [0.48145466, 0.4578275, 0.40821073], + "image_std": [0.26862954, 0.26130258, 0.27577711], + } + self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME) + with open(self.image_processor_file, "w", encoding="utf-8") as fp: + json.dump(image_processor_map, fp) + + def get_tokenizer(self, **kwargs): + return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_rust_tokenizer(self, **kwargs): + return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + + def get_image_processor(self, **kwargs): + return ViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def test_save_load_pretrained_default(self): + tokenizer_slow = self.get_tokenizer() + tokenizer_fast = self.get_rust_tokenizer() + image_processor = self.get_image_processor() + + processor_slow = CLIPSegProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) + processor_slow.save_pretrained(self.tmpdirname) + processor_slow = CLIPSegProcessor.from_pretrained(self.tmpdirname, use_fast=False) + + processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast.save_pretrained(self.tmpdirname) + processor_fast = CLIPSegProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) + self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) + self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) + self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer) + self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast) + + self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertIsInstance(processor_slow.image_processor, ViTImageProcessor) + 
self.assertIsInstance(processor_fast.image_processor, ViTImageProcessor) + + def test_save_load_pretrained_additional_features(self): + processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + + processor = CLIPSegProcessor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, ViTImageProcessor) + + def test_image_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor) + + image_input = self.prepare_image_inputs() + + input_feat_extract = image_processor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_processor_text(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"]) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + def test_processor_visual_prompt(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor) + + image_input = self.prepare_image_inputs() + visual_prompt_input = self.prepare_image_inputs() + + inputs = processor(images=image_input, visual_prompt=visual_prompt_input) + + self.assertListEqual(list(inputs.keys()), ["pixel_values", "conditional_pixel_values"]) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + def test_tokenizer_decode(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) diff --git a/docs/transformers/tests/models/clvp/__init__.py 
b/docs/transformers/tests/models/clvp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/clvp/test_feature_extraction_clvp.py b/docs/transformers/tests/models/clvp/test_feature_extraction_clvp.py new file mode 100644 index 0000000000000000000000000000000000000000..78bf38e2df5a865583f0937edfe481f0b7f5c248 --- /dev/null +++ b/docs/transformers/tests/models/clvp/test_feature_extraction_clvp.py @@ -0,0 +1,240 @@ +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import os +import random +import tempfile +import unittest + +import numpy as np +from datasets import Audio, load_dataset + +from transformers import ClvpFeatureExtractor +from transformers.testing_utils import ( + check_json_file_has_correct_format, + cleanup, + require_torch, + slow, + torch_device, +) +from transformers.utils.import_utils import is_torch_available + +from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin + + +if is_torch_available(): + import torch + +global_rng = random.Random() + + +# Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.floats_list +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values + + +@require_torch +class ClvpFeatureExtractionTester: + def __init__( + self, + parent, + batch_size=7, + min_seq_length=400, + max_seq_length=2000, + feature_size=10, + hop_length=160, + chunk_length=8, + padding_value=0.0, + sampling_rate=4_000, + return_attention_mask=False, + ): + self.parent = parent + self.batch_size = batch_size + self.min_seq_length = min_seq_length + self.max_seq_length = max_seq_length + self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) + self.padding_value = padding_value + self.sampling_rate = sampling_rate + self.return_attention_mask = return_attention_mask + self.feature_size = feature_size + self.chunk_length = chunk_length + self.hop_length = hop_length + + def prepare_feat_extract_dict(self): + return { + "feature_size": self.feature_size, + "hop_length": self.hop_length, + "chunk_length": self.chunk_length, + "padding_value": self.padding_value, + "sampling_rate": self.sampling_rate, + "return_attention_mask": self.return_attention_mask, + } + + # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester.prepare_inputs_for_common + def prepare_inputs_for_common(self, equal_length=False, numpify=False): + def _flatten(list_of_lists): + return list(itertools.chain(*list_of_lists)) + + if equal_length: + speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)] + else: + # make sure that 
inputs increase in size + speech_inputs = [ + floats_list((x, self.feature_size)) + for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) + ] + if numpify: + speech_inputs = [np.asarray(x) for x in speech_inputs] + return speech_inputs + + +@require_torch +class ClvpFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): + feature_extraction_class = ClvpFeatureExtractor + + def setUp(self): + self.feat_extract_tester = ClvpFeatureExtractionTester(self) + + def tearDown(self): + super().tearDown() + # clean-up as much as possible GPU memory occupied by PyTorch + cleanup(torch_device) + + # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_from_and_save_pretrained + def test_feat_extract_from_and_save_pretrained(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] + check_json_file_has_correct_format(saved_file) + feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + mel_1 = feat_extract_first.mel_filters + mel_2 = feat_extract_second.mel_filters + self.assertTrue(np.allclose(mel_1, mel_2)) + self.assertEqual(dict_first, dict_second) + + # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_to_json_file + def test_feat_extract_to_json_file(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "feat_extract.json") + feat_extract_first.to_json_file(json_file_path) + feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) + + dict_first = feat_extract_first.to_dict() + dict_second = feat_extract_second.to_dict() + mel_1 = feat_extract_first.mel_filters + mel_2 = feat_extract_second.mel_filters + self.assertTrue(np.allclose(mel_1, mel_2)) + self.assertEqual(dict_first, dict_second) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + # create three inputs of length 800, 1000, and 1200 + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] + + # Test feature size + input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features + self.assertTrue(input_features.ndim == 3) + self.assertTrue(input_features.shape[-2] == feature_extractor.feature_size) + + # Test not batched input + encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features + encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features + self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) + + # Test batched + encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features + encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + 
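+        # Illustrative sketch (not part of the upstream test): outside of these synthetic
+        # inputs, the extractor is driven the same way `test_integration` below does it,
+        # e.g. on audio resampled to 22050 Hz:
+        #
+        #   fe = ClvpFeatureExtractor.from_pretrained("susnato/clvp_dev")
+        #   feats = fe(raw_audio, sampling_rate=22050, return_tensors="pt").input_features
+        #
+        # where `raw_audio` stands in for any 1-D float waveform at that sampling rate.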
+ # Test 2-D numpy arrays are batched. + speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)] + np_speech_inputs = np.asarray(speech_inputs) + encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features + encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + # Test truncation required + speech_inputs = [floats_list((1, x))[0] for x in range(200, (feature_extractor.n_samples + 500), 200)] + np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] + + speech_inputs_truncated = [x[: feature_extractor.n_samples] for x in speech_inputs] + np_speech_inputs_truncated = [np.asarray(speech_input) for speech_input in speech_inputs_truncated] + + encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features + encoded_sequences_2 = feature_extractor(np_speech_inputs_truncated, return_tensors="np").input_features + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_double_precision_pad + def test_double_precision_pad(self): + import torch + + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + np_speech_inputs = np.random.rand(100, 32).astype(np.float64) + py_speech_inputs = np_speech_inputs.tolist() + + for inputs in [py_speech_inputs, np_speech_inputs]: + np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np") + self.assertTrue(np_processed.input_features.dtype == np.float32) + pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt") + self.assertTrue(pt_processed.input_features.dtype == torch.float32) + + def _load_datasamples(self, num_samples): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = ds.cast_column("audio", Audio(sampling_rate=22050)) + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples] + + @slow + def test_integration(self): + # fmt: off + EXPECTED_INPUT_FEATURES = torch.tensor( + [ + 0.9271, 1.1405, 1.4419, 1.2470, 1.2438, 1.1787, 1.0595, 1.0570, 1.1070, + 1.2205, 1.2376, 1.2997, 1.1131, 1.0843, 1.0459, 1.1858, 1.2323, 1.3582, + 1.3401, 1.3770, 1.4173, 1.3381, 1.2291, 1.0854, 1.2116, 1.1873, 1.2178, + 1.2137, 1.3001, 1.4274 + ] + ) + # fmt: on + + input_speech, sr = self._load_datasamples(1) + + feature_extractor = ClvpFeatureExtractor.from_pretrained("susnato/clvp_dev") + input_features = feature_extractor(input_speech, sampling_rate=sr[0], return_tensors="pt").input_features + self.assertEqual(input_features.shape, (1, 80, 517)) + torch.testing.assert_close(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/clvp/test_modeling_clvp.py b/docs/transformers/tests/models/clvp/test_modeling_clvp.py new file mode 100644 index 0000000000000000000000000000000000000000..60c165fbbe8d7887113cbfd067309917841ef99e --- /dev/null +++ b/docs/transformers/tests/models/clvp/test_modeling_clvp.py @@ -0,0 +1,640 @@ +# Copyright 
2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Clvp model.""" + +import tempfile +import unittest + +import datasets +import numpy as np + +from transformers import ClvpConfig, ClvpDecoderConfig, ClvpEncoderConfig +from transformers.testing_utils import ( + cleanup, + require_torch, + slow, + torch_device, +) +from transformers.utils import is_torch_available + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + ids_tensor, + random_attention_mask, +) +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ClvpEncoder, ClvpForCausalLM, ClvpModel, ClvpModelForConditionalGeneration + +from transformers import ClvpFeatureExtractor, ClvpTokenizer + + +class ClvpEncoderTester: + def __init__( + self, + parent, + batch_size=2, + seq_length=7, + is_training=False, + use_input_mask=True, + use_labels=True, + vocab_size=50, + hidden_size=128, + projection_dim=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=32, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + + def get_config(self): + encoder_config = ClvpEncoderConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + ) + + return encoder_config + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + encoder_config = 
self.get_config() + + return encoder_config, input_ids, input_mask + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + speech_config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids.to(torch_device), "attention_mask": input_mask.to(torch_device)} + return speech_config, inputs_dict + + def create_and_check_model(self, speech_config, input_ids, input_mask): + text_config = ClvpEncoderConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + text_encoder_model = ClvpEncoder(config=text_config) + text_encoder_model.to(torch_device) + text_encoder_model.eval() + with torch.no_grad(): + result = text_encoder_model(input_ids, attention_mask=input_mask) + result = text_encoder_model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result[0].shape, (self.batch_size, self.projection_dim)) + + # now check with speech config + speech_encoder_model = ClvpEncoder(config=speech_config) + speech_encoder_model.to(torch_device) + speech_encoder_model.eval() + with torch.no_grad(): + result = speech_encoder_model(input_ids, attention_mask=input_mask) + result = speech_encoder_model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result[0].shape, (self.batch_size, self.projection_dim)) + + +@require_torch +class ClvpEncoderTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (ClvpEncoder,) if is_torch_available() else () + test_pruning = False + test_head_masking = False + test_torchscript = False + + def setUp(self): + self.model_tester = ClvpEncoderTester(self) + self.encoder_config_tester = ConfigTester(self, config_class=ClvpEncoderConfig, hidden_size=32) + + def tearDown(self): + super().tearDown() + # clean-up as much as possible GPU memory occupied by PyTorch + cleanup(torch_device) + + def test_config(self): + self.encoder_config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="ClvpEncoder does not output loss") + def test_training(self): + pass + + @unittest.skip(reason="ClvpEncoder does not output loss") + def test_training_gradient_checkpointing(self): + pass + + +class ClvpDecoderTester: + def __init__( + self, + parent, + batch_size=2, + seq_length=3, + is_training=False, + vocab_size=300, + max_position_embeddings=256, + max_text_tokens=256, + use_input_mask=True, + hidden_size=128, + num_hidden_layers=2, + num_attention_heads=2, + bos_token_id=97, + eos_token_id=98, + relative_attention_num_buckets=4, + relative_attention_max_distance=16, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.max_text_tokens = max_text_tokens + self.use_input_mask = use_input_mask + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.num_hidden_layers = 
num_hidden_layers + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.relative_attention_num_buckets = relative_attention_num_buckets + self.relative_attention_max_distance = relative_attention_max_distance + + def get_config(self): + decoder_config = ClvpDecoderConfig( + vocab_size=self.vocab_size, + max_position_embeddings=self.max_position_embeddings, + max_text_tokens=self.max_text_tokens, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + relative_attention_num_buckets=self.relative_attention_num_buckets, + relative_attention_max_distance=self.relative_attention_max_distance, + ) + + return decoder_config + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + decoder_config = self.get_config() + + return decoder_config, input_ids, input_mask + + def create_and_check_model(self, config, input_ids, attention_mask): + model = ClvpForCausalLM(config).to(torch_device).eval() + with torch.no_grad(): + result = model(input_ids=input_ids, attention_mask=attention_mask) + + self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask = config_and_inputs + inputs_dict = { + "input_ids": input_ids.to(torch_device), + "attention_mask": attention_mask.to(torch_device), + } + return config, inputs_dict + + +@require_torch +class ClvpDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (ClvpModel, ClvpForCausalLM) if is_torch_available() else () + pipeline_model_mapping = {"feature-extraction": ClvpModelForConditionalGeneration} if is_torch_available() else {} + + test_pruning = False + + def setUp(self): + self.model_tester = ClvpDecoderTester(self) + self.decoder_config_tester = ConfigTester(self, config_class=ClvpDecoderConfig, hidden_size=32) + + def tearDown(self): + super().tearDown() + # clean-up as much as possible GPU memory occupied by PyTorch + cleanup(torch_device) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + if return_labels and model_class == ClvpForCausalLM: + inputs_dict["labels"] = torch.zeros( + [self.model_tester.batch_size, self.model_tester.seq_length], device=torch_device + ).long() + + return inputs_dict + + def test_training(self): + # we will only test the ClvpForCausalLM since it outputs loss + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + model = ClvpForCausalLM(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, ClvpForCausalLM, return_labels=True) + loss = model(**inputs).loss + 
loss.backward() + + def test_training_gradient_checkpointing(self): + # we will only test the ClvpForCausalLM since it outputs loss + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + + model = ClvpForCausalLM(config) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + inputs = self._prepare_for_class(inputs_dict, ClvpForCausalLM, return_labels=True) + + loss = model(**inputs).loss + loss.backward() + + @unittest.skip(reason="Clvp `prepare_inputs_for_generation` function doesn't have cache position.") + def test_generate_continue_from_inputs_embeds(self): + pass + + +class ClvpModelForConditionalGenerationTester: + def __init__(self, parent, is_training=False): + self.parent = parent + self.clvp_encoder_tester = ClvpEncoderTester(parent) + self.is_training = is_training + self.batch_size = self.clvp_encoder_tester.batch_size # need bs for batching_equivalence test + + def get_config(self): + decoder_config = ClvpDecoderConfig( + vocab_size=50, + max_position_embeddings=30, + max_text_tokens=30, + hidden_size=128, + num_hidden_layers=1, + num_attention_heads=2, + bos_token_id=97, + eos_token_id=98, + relative_attention_num_buckets=4, + relative_attention_max_distance=16, + ) + text_config = self.clvp_encoder_tester.get_config() + speech_config = self.clvp_encoder_tester.get_config() + speech_config.vocab_size = 300 + + return ClvpConfig.from_sub_model_configs( + text_config, + speech_config, + decoder_config, + projection_dim=16, + ) + + def prepare_config_and_inputs(self): + _, input_ids, attention_mask = self.clvp_encoder_tester.prepare_config_and_inputs() + + ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) + _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() + + feature_extractor = ClvpFeatureExtractor() + input_features = feature_extractor(raw_speech=audio, sampling_rate=sr, return_tensors="pt")[ + "input_features" + ].to(torch_device) + + config = self.get_config() + + return config, input_ids, attention_mask, input_features + + def create_and_check_model(self, config, input_ids, attention_mask, input_features): + model = ClvpModelForConditionalGeneration(config).to(torch_device).eval() + with torch.no_grad(): + result = model(input_ids=input_ids, input_features=input_features, attention_mask=attention_mask) + + self.parent.assertEqual(result.logits_per_speech.shape, (2, self.clvp_encoder_tester.batch_size)) + self.parent.assertEqual(result.logits_per_text.shape, (self.clvp_encoder_tester.batch_size, 2)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, input_features = config_and_inputs + inputs_dict = { + "input_ids": input_ids.to(torch_device), + "attention_mask": attention_mask.to(torch_device), + "input_features": input_features.to(torch_device), + "return_loss": False, + } + return config, inputs_dict + + +@require_torch +class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (ClvpModelForConditionalGeneration,) if is_torch_available() else () + # Doesn't run generation tests. 
There are interface mismatches when using `generate` -- TODO @gante + all_generative_model_classes = () + + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + test_torchscript = False + + def setUp(self): + self.model_tester = ClvpModelForConditionalGenerationTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.clvp_config_tester = ConfigTester( + self, config_class=ClvpConfig, has_text_modality=False, common_properties=common_properties, hidden_size=32 + ) + + def test_config(self): + self.clvp_config_tester.run_common_tests() + + def tearDown(self): + super().tearDown() + # clean-up as much as possible GPU memory occupied by PyTorch + cleanup(torch_device) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + # check for decoder model, text encoder model and speech encoder model hidden states + decoder_hidden_states = outputs.decoder_hidden_states + text_encoder_hidden_states = outputs.text_encoder_hidden_states + speech_encoder_hidden_states = outputs.speech_encoder_hidden_states + + # check length of the hidden states + expected_decoder_num_layers = config.decoder_config.num_hidden_layers + 1 + self.assertEqual(len(decoder_hidden_states), expected_decoder_num_layers) + + expected_speech_encoder_num_layers = config.text_config.num_hidden_layers + 1 + self.assertEqual(len(text_encoder_hidden_states), expected_speech_encoder_num_layers) + + expected_text_encoder_num_layers = config.speech_config.num_hidden_layers + 1 + self.assertEqual(len(speech_encoder_hidden_states), expected_text_encoder_num_layers) + + # check shapes of each hidden state + + # for the decoder model we will only test the dimension because the ClvpConditioningEncoder could increase + # the sequence lengths. + self.assertEqual(decoder_hidden_states[0].shape[-1], config.decoder_config.hidden_size) + + # the testing for text encoder stays standard because we just pass the text tokens here. + self.assertListEqual( + list(text_encoder_hidden_states[0].shape[-2:]), + [self.model_tester.clvp_encoder_tester.seq_length, config.text_config.hidden_size], + ) + + # for the decoder model we will only test the dimension because the fix_decoder_outputs method could increase + # the sequence lengths by adding `decoder_fixing_codes` tokens at the end. 
+ self.assertEqual(speech_encoder_hidden_states[0].shape[-1], config.speech_config.hidden_size) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="ClvpModelForConditionalGeneration does not have get_input_embeddings") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="ClvpModelForConditionalGeneration does not have get_input_embeddings") + def test_model_get_set_embeddings(self): + pass + + # override as the `logit_scale` parameter initialization is different for Clvp + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + # check if `logit_scale` is initialized as per the original implementation + if name == "logit_scale": + expected_value = np.log(1 / 0.07) + returned_value = param.data.item() + + self.assertAlmostEqual( + returned_value, + expected_value, + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + expected_range = [0.0, 1.0] + returned_range = ((param.data.mean() * 1e9).round() / 1e9).item() + + self.assertIn( + returned_range, + expected_range, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_load_speech_text_decoder_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Save ClvpConfig and check if we can load ClvpEncoderConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + encoder_config = ClvpEncoderConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.text_config.to_dict(), encoder_config.to_dict()) + + # Save ClvpConfig and check if we can load ClvpDecoderConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + decoder_config = ClvpDecoderConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.decoder_config.to_dict(), decoder_config.to_dict()) + + @slow + def test_model_from_pretrained(self): + model_name = "susnato/clvp_dev" + model = ClvpModelForConditionalGeneration.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# Since Clvp has a lot of different models connected with each other it's better to test each of them individually along +# with a test_full_model_integration. If the model breaks in future, it could be of a great help to identify the broken part. + + +@slow +@require_torch +class ClvpIntegrationTest(unittest.TestCase): + def setUp(self): + self.text = "This is an example text." 
+ ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) + _, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() + + self.model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev").to(torch_device) + self.model.eval() + tokenizer = ClvpTokenizer.from_pretrained("susnato/clvp_dev") + feature_extractor = ClvpFeatureExtractor.from_pretrained("susnato/clvp_dev") + + tokenizer_output = tokenizer(self.text, return_tensors="pt") + self.text_tokens = tokenizer_output["input_ids"].to(torch_device) + self.input_features = feature_extractor( + raw_speech=self.speech_samples, sampling_rate=self.sr, return_tensors="pt" + )["input_features"].to(torch_device) + + def tearDown(self): + super().tearDown() + # clean-up as much as possible GPU memory occupied by PyTorch + cleanup(torch_device, gc_collect=True) + + def test_conditional_encoder(self): + with torch.no_grad(): + conditioning_encoder_outputs = self.model.conditioning_encoder( + input_features=self.input_features, input_ids=self.text_tokens + ).to("cpu") + + self.assertEqual( + conditioning_encoder_outputs.shape, + torch.Size((self.input_features.shape[0], 18, self.model.config.decoder_config.hidden_size)), + ) + + EXPECTED_OUTPUTS = torch.tensor( + [[-0.8582, 0.5228, 1.9944], [-0.0465, -1.1017, -0.0093], [-0.0466, -0.6030, -0.1280]] + ) + + torch.testing.assert_close(conditioning_encoder_outputs[0, :3, :3], EXPECTED_OUTPUTS, rtol=1e-4, atol=1e-4) + + def test_decoder_model_generate(self): + autoregressive_model_output = self.model.speech_decoder_model.generate(input_ids=self.text_tokens).cpu() + + EXPECTED_OUTPUTS = torch.tensor([[147, 2, 54, 2, 43, 2, 169, 122, 29, 64, 2, 136, 37, 33, 9, 8193]]) + + torch.testing.assert_close(autoregressive_model_output, EXPECTED_OUTPUTS) + + def test_text_and_speech_encoder_models(self): + # check for text embeds + text_embeds = self.model.text_encoder_model(input_ids=self.text_tokens, return_dict=True)[0].cpu() + + # fmt: off + EXPECTED_TEXT_EMBEDS = torch.tensor([1.4798, -2.0005, 2.3902, -0.5042, 1.6401, -2.4135, -1.4800, 3.0118, -2.4422, 1.3266, 2.2339, 1.4761, -4.8983, -1.3592, 6.0251, 6.7364, 2.2576, 3.7229, -10.0436, 4.6676]) + # fmt: on + + torch.testing.assert_close(text_embeds[0, :20], EXPECTED_TEXT_EMBEDS, rtol=1e-4, atol=1e-4) + + # check for speech embeds + speech_embeds = self.model.speech_encoder_model(input_ids=self.text_tokens, return_dict=True)[0].cpu() + + # fmt: off + EXPECTED_SPEECH_EMBEDS = torch.tensor([3.1202, -3.1183, -1.4264, -6.1339, 1.8885, -0.1983, 0.9461, -1.7414, 0.3320, -3.8400, -1.5715, 1.5096, -1.7576, 0.2387, 4.9758, 5.8450, -6.2534, 2.8587, -5.5816, 4.7821]) + # fmt: on + + torch.testing.assert_close(speech_embeds[0, :20], EXPECTED_SPEECH_EMBEDS, rtol=1e-4, atol=1e-4) + + def test_full_model_integration(self): + full_model_output = self.model.generate( + input_ids=self.text_tokens, + input_features=self.input_features, + do_sample=False, + num_beams=4, + num_return_sequences=4, + max_new_tokens=10, + ) + + EXPECTED_SPEECH_IDS = torch.tensor([[1953, 1080, 612], [1953, 612, 493], [1953, 612, 716]]) + EXPECTED_SIMILARITY_SCORES = torch.tensor([[14.7660, 14.4569, 13.6472, 13.5683]]) + + torch.testing.assert_close(full_model_output.speech_ids.cpu()[-3:, -3:], EXPECTED_SPEECH_IDS) + torch.testing.assert_close(full_model_output.logits_per_text.cpu(), EXPECTED_SIMILARITY_SCORES) diff --git 
a/docs/transformers/tests/models/clvp/test_processor_clvp.py b/docs/transformers/tests/models/clvp/test_processor_clvp.py new file mode 100644 index 0000000000000000000000000000000000000000..f751ab92d03d957906f89a06f1eeee0d4b66bc5e --- /dev/null +++ b/docs/transformers/tests/models/clvp/test_processor_clvp.py @@ -0,0 +1,136 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import gc +import shutil +import tempfile +import unittest + +from transformers import ClvpFeatureExtractor, ClvpProcessor, ClvpTokenizer +from transformers.testing_utils import require_torch + +from .test_feature_extraction_clvp import floats_list + + +@require_torch +class ClvpProcessorTest(unittest.TestCase): + def setUp(self): + self.checkpoint = "susnato/clvp_dev" + self.tmpdirname = tempfile.mkdtemp() + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmpdirname) + gc.collect() + + # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.get_tokenizer with Whisper->Clvp + def get_tokenizer(self, **kwargs): + return ClvpTokenizer.from_pretrained(self.checkpoint, **kwargs) + + # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.get_feature_extractor with Whisper->Clvp + def get_feature_extractor(self, **kwargs): + return ClvpFeatureExtractor.from_pretrained(self.checkpoint, **kwargs) + + # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_save_load_pretrained_default with Whisper->Clvp + def test_save_load_pretrained_default(self): + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + + processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + processor.save_pretrained(self.tmpdirname) + processor = ClvpProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) + self.assertIsInstance(processor.tokenizer, ClvpTokenizer) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) + self.assertIsInstance(processor.feature_extractor, ClvpFeatureExtractor) + + # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_feature_extractor with Whisper->Clvp,processor(raw_speech->processor(raw_speech=raw_speech + def test_feature_extractor(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + raw_speech = floats_list((3, 1000)) + + input_feat_extract = feature_extractor(raw_speech, return_tensors="np") + input_processor = processor(raw_speech=raw_speech, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + # Copied from 
transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_tokenizer with Whisper->Clvp + def test_tokenizer(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + input_str = "This is a test string" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_tokenizer_decode with Whisper->Clvp + def test_tokenizer_decode(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + def test_save_load_pretrained_additional_features(self): + processor = ClvpProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(pad_token="(PAD)") + feature_extractor_add_kwargs = self.get_feature_extractor(sampling_rate=16000) + + processor = ClvpProcessor.from_pretrained( + self.tmpdirname, + pad_token="(PAD)", + sampling_rate=16000, + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, ClvpTokenizer) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.feature_extractor, ClvpFeatureExtractor) + + def test_model_input_names(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + self.assertListEqual( + sorted(processor.model_input_names), + sorted(set(feature_extractor.model_input_names + tokenizer.model_input_names)), + msg="`processor` and `feature_extractor` model input names do not match", + ) diff --git a/docs/transformers/tests/models/clvp/test_tokenization_clvp.py b/docs/transformers/tests/models/clvp/test_tokenization_clvp.py new file mode 100644 index 0000000000000000000000000000000000000000..eb7304d66f8ea967c22b7492c743abfa236c4a8c --- /dev/null +++ b/docs/transformers/tests/models/clvp/test_tokenization_clvp.py @@ -0,0 +1,317 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import os +import unittest +from functools import lru_cache + +from transformers import ClvpTokenizer + +from ...test_tokenization_common import TokenizerTesterMixin, slow, use_cache_if_possible + + +class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "susnato/clvp_dev" + tokenizer_class = ClvpTokenizer + test_rust_tokenizer = False + from_pretrained_kwargs = {"add_prefix_space": True} + test_seq2seq = False + test_sentencepiece_ignore_case = True + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "", + "<|endoftext|>", + "[SPACE]", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + cls.special_tokens_map = {"unk_token": ""} + + cls.vocab_file = os.path.join(cls.tmpdirname, "vocab.json") + cls.merges_file = os.path.join(cls.tmpdirname, "merges.txt") + with open(cls.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(cls.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_tokenizer with GPT2->Clvp + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return ClvpTokenizer.from_pretrained(pretrained_name, **kwargs) + + # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts + def get_input_output_texts(self, tokenizer): + input_text = "lower newer" + output_text = "lower[SPACE]newer" + return input_text, output_text + + # Copied from transformers.tests.models.layoutxlm.test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_add_special_tokens + def test_add_special_tokens(self): + tokenizers: list[ClvpTokenizer] = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + special_token = "[SPECIAL_TOKEN]" + special_token_box = [1000, 1000, 1000, 1000] + + tokenizer.add_special_tokens({"cls_token": special_token}) + encoded_special_token = tokenizer.encode( + [special_token], boxes=[special_token_box], add_special_tokens=False + ) + self.assertEqual(len(encoded_special_token), 1) + + decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True) + self.assertTrue(special_token not in decoded) + + # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_rust_and_python_full_tokenizers + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) + + sequence = "lower newer" + + # Testing tokenization + tokens = tokenizer.tokenize(sequence, add_prefix_space=True) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + # Testing conversion to ids without special tokens + ids = tokenizer.encode(sequence, 
add_special_tokens=False, add_prefix_space=True) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + # Testing conversion to ids with special tokens + rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) + ids = tokenizer.encode(sequence, add_prefix_space=True) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + # Testing the unknown token + input_tokens = tokens + [rust_tokenizer.unk_token] + input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] + self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_padding + def test_padding(self, max_length=15): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + + # Simple input + s = "This is a simple input" + s2 = ["This is a simple input 1", "This is a simple input 2"] + p = ("This is a simple input", "This is a pair") + p2 = [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ] + + # Simple input tests + self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + s2, + max_length=max_length, + padding="max_length", + ) + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + p2, + max_length=max_length, + padding="max_length", + ) + + # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_padding_if_pad_token_set_slow + def test_padding_if_pad_token_set_slow(self): + tokenizer = ClvpTokenizer.from_pretrained(self.tmpdirname, pad_token="") + + # Simple input + s = "This is a simple input" + s2 = ["This is a simple input looooooooong", "This is a simple input"] + p = ("This is a simple input", "This is a pair") + p2 = [ + ("This is a simple input loooooong", "This is a simple input"), + ("This is a simple pair loooooong", "This is a simple pair"), + ] + + pad_token_id = tokenizer.pad_token_id + + out_s = tokenizer(s, padding="max_length", max_length=30, return_tensors="np") + out_s2 = tokenizer(s2, padding=True, truncate=True, return_tensors="np") + out_p = tokenizer(*p, padding="max_length", max_length=60, return_tensors="np") + out_p2 = tokenizer(p2, padding=True, truncate=True, return_tensors="np") + + # s + # test single string max_length padding + self.assertEqual(out_s["input_ids"].shape[-1], 30) + self.assertTrue(pad_token_id in out_s["input_ids"]) + self.assertTrue(0 in out_s["attention_mask"]) + + # s2 + # test automatic padding + self.assertEqual(out_s2["input_ids"].shape[-1], 33) + # long slice doesn't have padding + self.assertFalse(pad_token_id in out_s2["input_ids"][0]) + self.assertFalse(0 in out_s2["attention_mask"][0]) + # short slice does have padding + self.assertTrue(pad_token_id in out_s2["input_ids"][1]) + 
self.assertTrue(0 in out_s2["attention_mask"][1]) + + # p + # test single pair max_length padding + self.assertEqual(out_p["input_ids"].shape[-1], 60) + self.assertTrue(pad_token_id in out_p["input_ids"]) + self.assertTrue(0 in out_p["attention_mask"]) + + # p2 + # test automatic padding pair + self.assertEqual(out_p2["input_ids"].shape[-1], 52) + # long slice pair doesn't have padding + self.assertFalse(pad_token_id in out_p2["input_ids"][0]) + self.assertFalse(0 in out_p2["attention_mask"][0]) + # short slice pair does have padding + self.assertTrue(pad_token_id in out_p2["input_ids"][1]) + self.assertTrue(0 in out_p2["attention_mask"][1]) + + # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_special_tokens_mask_input_pairs_and_bos_token + def test_special_tokens_mask_input_pairs_and_bos_token(self): + # TODO: change to self.get_tokenizers() when the fast version is implemented + tokenizers = [self.get_tokenizer(do_lower_case=False, add_bos_token=True)] + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequence_0 = "Encode this." + sequence_1 = "This one too please." + encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, + sequence_1, + add_special_tokens=True, + return_special_tokens_mask=True, + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + + filtered_sequence = [ + (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) + ] + filtered_sequence = [x for x in filtered_sequence if x is not None] + self.assertEqual(encoded_sequence, filtered_sequence) + + def test_token_type_ids(self): + tokenizer = self.get_tokenizer() + seq_0 = "Test this method." + + # We want to have sequence 0 and sequence 1 are tagged + # respectively with 0 and 1 token_ids + # (regardless of whether the model use token type ids) + # We use this assumption in the QA pipeline among other place + output = tokenizer(seq_0, return_token_type_ids=True, add_special_tokens=True) + self.assertIn(0, output["token_type_ids"]) + + def test_full_tokenizer(self): + tokenizer = ClvpTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "lower newer" + bpe_tokens = ["l", "o", "w", "er", "[SPACE]", "n", "e", "w", "er"] + tokens = tokenizer.tokenize(text, add_prefix_space=False) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [0, 1, 2, 15, 21, 9, 3, 2, 15, 19] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + @slow + def test_outputs_with_numbers(self): + text = "hello and this is an example text and I have $1000. my lucky number is 12345." 
+ tokenizer = ClvpTokenizer.from_pretrained("susnato/clvp_dev") + + # fmt: off + EXPECTED_OUTPUT = [62, 84, 28, 2, 53, 2,147, 2, 54, 2, 43, 2, 169, 122, 29, 64, 2, 136, 37, 33, 2, 53, 2, 22, + 2, 148, 2, 110, 2, 40, 206, 53, 2, 134, 84, 59, 32, 9, 2, 125, 2, 25, 34, 197, 38, 2, 27, + 231, 15, 44, 2, 54, 2, 33, 100, 25, 76, 2, 40, 206, 53, 7, 2, 40, 46, 18, 2, 21, 97, 17, + 219, 2, 87, 210, 8, 19, 22, 76, 9, + ] + # fmt: on + + self.assertListEqual(tokenizer.encode(text, add_special_tokens=False), EXPECTED_OUTPUT) + + @slow + def test_tokenizer_integration(self): + sequences = [ + "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides " + "general-purpose architectures (BERT, RoBERTa, XLM, DistilBert, XLNet...) for Natural " + "Language Understanding (NLU) and Natural Language Generation (NLG) with over multiple pretrained " + "models and deep interoperability between Jax, PyTorch and TensorFlow.", + "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly " + "conditioning on both left and right context in all layers.", + "The quick brown fox jumps over the lazy dog.", + ] + + # fmt: off + expected_encoding = {'input_ids': [[144, 43, 32, 87, 26, 173, 2, 5, 87, 26, 44, 70, 2, 209, 27, 2, 55, 2, 29, 38, 51, 31, 71, 8, 144, 43, 32, 87, 26, 173, 2, 53, 2, 29, 38, 51, 31, 71, 8, 29, 46, 144, 137, 49, 8, 15, 44, 33, 6, 2, 187, 35, 83, 61, 2, 20, 50, 44, 56, 8, 29, 121, 139, 66, 2, 59, 71, 60, 18, 16, 33, 34, 175, 2, 5, 15, 44, 33, 7, 2, 89, 15, 44, 33, 14, 7, 2, 37, 25, 26, 7, 2, 17, 54, 78, 25, 15, 44, 33, 7, 2, 37, 25, 111, 33, 9, 9, 9, 6, 2, 87, 2, 27, 48, 121, 56, 2, 25, 43, 20, 34, 14, 112, 2, 97, 234, 63, 53, 52, 2, 5, 27, 25, 34, 6, 2, 53, 2, 27, 48, 121, 56, 2, 25, 43, 20, 34, 14, 112, 2, 20, 50, 44, 158, 2, 5, 27, 25, 20, 6, 2, 103, 2, 253, 2, 26, 167, 78, 29, 64, 2, 29, 46, 144, 137, 49, 2, 115, 126, 25, 32, 2, 53, 2, 126, 18, 29, 2, 41, 114, 161, 44, 109, 151, 240, 2, 67, 33, 100, 50, 2, 23, 14, 37, 7, 2, 29, 38, 51, 31, 71, 2, 53, 2, 33, 50, 32, 57, 19, 25, 69, 9], [ 15, 44, 33, 2, 54, 2, 17, 61, 22, 20, 27, 49, 2, 51, 2, 29, 46, 8, 144, 137, 2, 126, 18, 29, 2, 15, 83, 22, 46, 16, 181, 56, 2, 46, 29, 175, 86, 158, 32, 2, 154, 2, 97, 25, 14, 67, 25, 49, 2, 136, 37, 33, 2, 185, 2, 23, 28, 41, 33, 70, 2, 135, 17, 60, 107, 52, 2, 47, 2, 165, 40, 2, 64, 19, 33, 2, 53, 2, 101, 104, 2, 135, 136, 37, 33, 2, 41, 2, 108, 2, 25, 88, 173, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [ 42, 2, 194, 91, 24, 2, 243, 190, 2, 182, 37, 2, 23, 231, 29, 32, 2, 253, 2, 42, 2, 25, 14, 39, 38, 2, 134, 20, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], # noqa: E501 + 
'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], # noqa: E501 + } + # fmt: on + + self.tokenizer_integration_test_util( + sequences=sequences, expected_encoding=expected_encoding, model_name="susnato/clvp_dev", padding=True + ) diff --git a/docs/transformers/tests/models/code_llama/__init__.py b/docs/transformers/tests/models/code_llama/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/code_llama/test_tokenization_code_llama.py b/docs/transformers/tests/models/code_llama/test_tokenization_code_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..a12d2fc4c55e4be2f103d3d80166908f5a211c8c --- /dev/null +++ b/docs/transformers/tests/models/code_llama/test_tokenization_code_llama.py @@ -0,0 +1,653 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import pickle +import shutil +import tempfile +import unittest + +from datasets import load_dataset + +from transformers import ( + SPIECE_UNDERLINE, + AddedToken, + CodeLlamaTokenizer, + CodeLlamaTokenizerFast, +) +from transformers.convert_slow_tokenizer import convert_slow_tokenizer +from transformers.testing_utils import ( + get_tests_dir, + nested_simplify, + require_sentencepiece, + require_tokenizers, + require_torch, + slow, +) + +from ...test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") + + +@require_sentencepiece +@require_tokenizers +class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "hf-internal-testing/llama-code-tokenizer" + tokenizer_class = CodeLlamaTokenizer + rust_tokenizer_class = CodeLlamaTokenizerFast + test_rust_tokenizer = False + test_sentencepiece = True + from_pretrained_kwargs = {} + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # We have a SentencePiece fixture for testing + tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer.pad_token = tokenizer.eos_token + tokenizer.save_pretrained(cls.tmpdirname) + + def get_tokenizers(cls, **kwargs): + kwargs.update({"pad_token": ""}) + return super().get_tokenizers(**kwargs) + + def test_no_infilling_init(self): + tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, prefix_token=None, keep_accents=True) + with self.assertRaises(ValueError): + tokenizer.tokenize("This is prefix") + + def test_full_tokenizer(self): + tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), + [285, 46, 10, 170, 382], + ) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, + [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4], + ) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "", + ".", + ], + ) + + def test_save_pretrained(self): + self.tokenizers_list = [ + (self.rust_tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}), + (self.tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}), + (self.tokenizer_class, "codellama/CodeLlama-34b-Instruct-hf", {}), + (self.rust_tokenizer_class, "codellama/CodeLlama-34b-Instruct-hf", {}), + ] + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + + 
tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it save with the same files + the tokenizer.json file for the fast one + self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) + tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f) + self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + + shutil.rmtree(tmpdirname2) + + # Save tokenizer rust, legacy_format=True + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it save with the same files + self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + + shutil.rmtree(tmpdirname2) + + # Save tokenizer rust, legacy_format=False + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it saved the tokenizer.json file + self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + + shutil.rmtree(tmpdirname2) + + @require_torch + def test_batch_tokenization(self): + if not self.test_seq2seq: + self.skipTest(reason="test_seq2seq is False") + + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Longer text that will definitely require truncation. 
+ text = [ + " UN Chief Says There Is No Military Solution in Syria", + " Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for" + " Syria is that 'there is no military solution' to the nearly five-year conflict and more weapons" + " will only worsen the violence and misery for millions of people.", + ] + try: + batch = tokenizer( + text=text, + max_length=3, + max_target_length=10, + return_tensors="pt", + ) + except NotImplementedError: + self.skipTest(reason="Encountered NotImplementedError when calling tokenizer") + self.assertEqual(batch.input_ids.shape[1], 3) + # max_target_length will default to max_length if not specified + batch = tokenizer(text, max_length=3, return_tensors="pt") + self.assertEqual(batch.input_ids.shape[1], 3) + + batch_encoder_only = tokenizer(text=text, max_length=3, max_target_length=10, return_tensors="pt") + self.assertEqual(batch_encoder_only.input_ids.shape[1], 3) + self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) + self.assertNotIn("decoder_input_ids", batch_encoder_only) + + @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.") + def test_save_slow_from_fast_and_reload_fast(self): + pass + + def test_special_tokens_initialization(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + added_tokens = [AddedToken("", lstrip=True)] + + tokenizer_r = self.get_rust_tokenizer( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + r_output = tokenizer_r.encode("Hey this is a token") + + special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0] + + self.assertTrue(special_token_id in r_output) + + if self.test_slow_tokenizer: + tokenizer_cr = self.get_rust_tokenizer( + pretrained_name, + additional_special_tokens=added_tokens, + **kwargs, # , from_slow=True <- unfortunately too slow to convert + ) + tokenizer_p = self.tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + + p_output = tokenizer_p.encode("Hey this is a token") + + cr_output = tokenizer_cr.encode("Hey this is a token") + + self.assertEqual(p_output, r_output) + self.assertEqual(cr_output, r_output) + self.assertTrue(special_token_id in p_output) + self.assertTrue(special_token_id in cr_output) + + @slow + def test_tokenizer_integration(self): + expected_encoding = {'input_ids': [[1, 4103, 689, 414, 313, 24784, 368, 2998, 408, 282, 3637, 25350, 29899, 9067, 414, 322, 282, 3637, 25350, 29899, 1457, 3018, 1312, 29899, 2151, 29897, 8128, 2498, 29899, 15503, 4220, 6956, 1973, 313, 13635, 29911, 29892, 402, 7982, 29899, 29906, 29892, 1528, 13635, 29911, 29874, 29892, 1060, 26369, 29892, 6652, 309, 29933, 814, 29892, 1060, 29931, 6779, 11410, 363, 18385, 17088, 7634, 11235, 313, 25103, 29965, 29897, 322, 18385, 17088, 28203, 313, 25103, 29954, 29897, 411, 975, 29871, 29941, 29906, 29974, 758, 3018, 1312, 4733, 297, 29871, 29896, 29900, 29900, 29974, 10276, 322, 6483, 1006, 3372, 3097, 1546, 435, 1165, 29892, 10772, 29911, 25350, 322, 323, 6073, 17907, 29889], [1, 350, 20161, 338, 8688, 304, 758, 29899, 14968, 6483, 21000, 8684, 284, 22540, 515, 443, 29880, 24025, 1426, 491, 14002, 368, 4195, 292, 373, 1716, 2175, 322, 1492, 3030, 297, 599, 15359, 29889], [1, 450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203, 29889]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="hf-internal-testing/llama-code-tokenizer", + revision="6eb30c03ab6a9e2cdef4d523024909ec815ddb75", + padding=False, + ) + + def test_picklable(self): + with tempfile.NamedTemporaryFile() as f: + shutil.copyfile(SAMPLE_VOCAB, f.name) + tokenizer = CodeLlamaTokenizer(f.name, keep_accents=True) + pickled_tokenizer = pickle.dumps(tokenizer) + pickle.loads(pickled_tokenizer) + + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") + def test_pickle_subword_regularization_tokenizer(self): + pass + + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") + def test_subword_regularization_tokenizer(self): + pass + + +@require_torch +@require_sentencepiece +@require_tokenizers +class LlamaIntegrationTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + checkpoint_name = "hf-internal-testing/llama-code-tokenizer" + cls.tokenizer: CodeLlamaTokenizer = CodeLlamaTokenizer.from_pretrained(checkpoint_name) + cls.rust_tokenizer = CodeLlamaTokenizerFast.from_pretrained(checkpoint_name) + return cls + + @require_torch + def integration_tests(self): + inputs = self.tokenizer( + ["The following string should be properly encoded: Hello.", "But ird and ปี ird ด"], + return_tensors="pt", + ) + + self.assertEqual( + nested_simplify(inputs), + { + "input_ids": [ + [1, 450, 1494, 1347, 881, 367, 6284, 18511, 29901, 15043, 29889], + [1, 1205, 29871, 1823, 322, 29871, 31010, 30691, 1678, 1823, 1678, 30718], + ], + "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], + }, + ) + + def test_fast_special_tokens(self): + slow_tokenizer = self.tokenizer + fast_tokenizer = self.rust_tokenizer + slow = slow_tokenizer.encode("A sample test", add_special_tokens=True) + assert slow == [1, 319, 4559, 1243] + + fast_tokenizer.add_eos_token = False + fast = fast_tokenizer.encode("A sample test", add_special_tokens=True) + assert fast == [1, 319, 4559, 1243] + + fast_tokenizer.add_eos_token = True + fast = fast_tokenizer.encode("A sample test", add_special_tokens=True) + assert fast == [1, 319, 4559, 1243, 2] + + slow_tokenizer.add_eos_token = True + slow = slow_tokenizer.encode("A sample test", add_special_tokens=True) + assert slow == [1, 319, 4559, 1243, 2] + + fast_tokenizer = CodeLlamaTokenizerFast.from_pretrained( + "hf-internal-testing/llama-tokenizer", add_eos_token=True, add_bos_token=False + ) + fast = fast_tokenizer.encode("A sample test", add_special_tokens=True) + assert fast == [319, 4559, 1243, 2] + + slow_tokenizer = CodeLlamaTokenizer.from_pretrained( + "hf-internal-testing/llama-tokenizer", add_eos_token=True, add_bos_token=False + ) + slow = slow_tokenizer.encode("A sample test", add_special_tokens=True) + assert slow == [319, 4559, 1243, 2] + + self.tokenizer.add_eos_token = False + self.rust_tokenizer.add_eos_token = False + + @slow + def test_conversion(self): + # This is excruciatingly slow since it has to recreate the entire merge + # list from the original vocabulary in spm + self.rust_tokenizer.save_pretrained("./out") + with 
tempfile.TemporaryDirectory() as dirname: + self.rust_tokenizer.save_pretrained(dirname) + + with open(os.path.join(dirname, "tokenizer.json")) as f: + old_serialized = f.read() + + new_tokenizer = convert_slow_tokenizer(self.tokenizer) + with tempfile.NamedTemporaryFile() as f: + new_tokenizer.save(f.name) + # Re-opening since `f` is in bytes. + new_serialized = open(f.name).read() + with open("out_tokenizer.json", "w") as g: + g.write(new_serialized) + + self.assertEqual(old_serialized, new_serialized) + + def test_simple_encode_decode(self): + pyth_tokenizer = self.tokenizer + rust_tokenizer = self.rust_tokenizer + + self.assertEqual(pyth_tokenizer.encode("This is a test"), [1, 910, 338, 263, 1243]) + self.assertEqual(rust_tokenizer.encode("This is a test"), [1, 910, 338, 263, 1243]) + self.assertEqual(pyth_tokenizer.decode([1, 910, 338, 263, 1243], skip_special_tokens=True), "This is a test") + self.assertEqual(rust_tokenizer.decode([1, 910, 338, 263, 1243], skip_special_tokens=True), "This is a test") + + # bytefallback showcase + self.assertEqual(pyth_tokenizer.encode("生活的真谛是"), [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392]) # fmt: skip + self.assertEqual(rust_tokenizer.encode("生活的真谛是"), [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392]) # fmt: skip + self.assertEqual( + pyth_tokenizer.decode( + [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392], skip_special_tokens=True + ), + "生活的真谛是", + ) + self.assertEqual( + rust_tokenizer.decode( + [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392], skip_special_tokens=True + ), + "生活的真谛是", + ) + + # Inner spaces showcase + self.assertEqual(pyth_tokenizer.encode("Hi Hello"), [1, 6324, 29871, 15043]) + self.assertEqual(rust_tokenizer.encode("Hi Hello"), [1, 6324, 29871, 15043]) + self.assertEqual(pyth_tokenizer.decode([1, 6324, 29871, 15043], skip_special_tokens=True), "Hi Hello") + self.assertEqual(rust_tokenizer.decode([1, 6324, 29871, 15043], skip_special_tokens=True), "Hi Hello") + + self.assertEqual(pyth_tokenizer.encode("Hi Hello"), [1, 6324, 259, 15043]) + self.assertEqual(rust_tokenizer.encode("Hi Hello"), [1, 6324, 259, 15043]) + self.assertEqual(pyth_tokenizer.decode([1, 6324, 259, 15043], skip_special_tokens=True), "Hi Hello") + self.assertEqual(rust_tokenizer.decode([1, 6324, 259, 15043], skip_special_tokens=True), "Hi Hello") + + self.assertEqual(pyth_tokenizer.encode(""), [1]) + self.assertEqual(rust_tokenizer.encode(""), [1]) + + self.assertEqual(pyth_tokenizer.encode(" "), [1, 259]) + self.assertEqual(rust_tokenizer.encode(" "), [1, 259]) + + self.assertEqual(pyth_tokenizer.encode(" "), [1, 1678]) + self.assertEqual(rust_tokenizer.encode(" "), [1, 1678]) + + self.assertEqual(pyth_tokenizer.encode(" Hello"), [1, 29871, 15043]) + self.assertEqual(rust_tokenizer.encode(" Hello"), [1, 29871, 15043]) + + def test_no_differences_showcase(self): + pyth_tokenizer = self.tokenizer + rust_tokenizer = self.rust_tokenizer + self.assertEqual(pyth_tokenizer.encode(""), [1]) + self.assertEqual(rust_tokenizer.encode(""), [1]) + + self.assertEqual(pyth_tokenizer.encode(" "), [1, 259]) + self.assertEqual(rust_tokenizer.encode(" "), [1, 259]) + + self.assertEqual(pyth_tokenizer.encode(" "), [1, 1678]) + self.assertEqual(rust_tokenizer.encode(" "), [1, 1678]) + + self.assertEqual(pyth_tokenizer.encode(" Hello"), [1, 29871, 15043]) + self.assertEqual(rust_tokenizer.encode(" Hello"), [1, 29871, 15043]) + + self.assertEqual(pyth_tokenizer.encode(""), [1, 1]) + self.assertEqual(rust_tokenizer.encode(""), [1, 
1]) + + def test_no_differences_decode(self): + pyth_tokenizer = self.tokenizer + rust_tokenizer = self.rust_tokenizer + + self.assertEqual(pyth_tokenizer.decode([869]), ".") + self.assertEqual(rust_tokenizer.decode([869]), ".") + + self.assertEqual(pyth_tokenizer.decode([30112, 869]), "ا .") + self.assertEqual(rust_tokenizer.decode([30112, 869]), "ا .") + + def test_no_differences_special_tokens(self): + pyth_tokenizer = self.tokenizer + rust_tokenizer = self.rust_tokenizer + self.assertEqual(pyth_tokenizer.encode(""), [1]) + self.assertEqual(rust_tokenizer.encode(""), [1]) + + self.assertEqual(pyth_tokenizer.encode(""), [1, 1]) + self.assertEqual(rust_tokenizer.encode(""), [1, 1]) + + @unittest.skipIf( + os.getenv("RUN_TOKENIZER_INTEGRATION", "0") == "0", + "RUN_TOKENIZER_INTEGRATION=1 to run tokenizer integration tests", + ) + def test_integration_test_xnli(self): + import tqdm + + pyth_tokenizer = self.tokenizer + rust_tokenizer = self.rust_tokenizer + + dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go") + for item in tqdm.tqdm(dataset["validation"]): + string = item["code"] + encoded1 = pyth_tokenizer.encode(string) + encoded2 = rust_tokenizer.encode(string) + + self.assertEqual(encoded1, encoded2) + + decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True) + decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True) + + self.assertEqual(decoded1, decoded2) + + dataset = load_dataset("facebook/xnli", "all_languages") + + for item in tqdm.tqdm(dataset["train"]): + for string in item["premise"].values(): + encoded1 = pyth_tokenizer.encode(string) + encoded2 = rust_tokenizer.encode(string) + + self.assertEqual(encoded1, encoded2) + + decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True) + decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True) + + self.assertEqual(decoded1, decoded2) + + def test_special_token_special_word(self): + # the word inform should be split as ['in', 'form'] + tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf", legacy=False) + tokenizer.add_tokens([AddedToken("", rstrip=True, lstrip=True)], special_tokens=False) + out1 = tokenizer.decode( + tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=False + ) + self.assertEqual(out1, "inform") + out2 = tokenizer.decode( + tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=True + ) + # the added prefix token should not be decoded + self.assertEqual(out2, " inform") + input_ids = tokenizer.encode("inform", add_special_tokens=False) + self.assertEqual(input_ids, [29871, 32016, 262, 689]) # 29871 is the spiece underline, '▁' + + out2 = tokenizer.decode( + tokenizer.encode(" inform", add_special_tokens=False), spaces_between_special_tokens=False + ) + # TODO @ArthurZ currently we strip left and right, so this will not keep the spaces + self.assertEqual(out2, "inform") + + ### Let's make sure decoding does not add extra spaces here and there + # TODO @ArthurZ this should be affected by the lstrip/rstrip/single word /normalize refactoring + # Since currently we always strip left and right of the token, results are as such + input_ids = tokenizer.encode(" Hellohow", add_special_tokens=False) + self.assertEqual(input_ids, [1, 15043, 1, 3525]) + tokens = tokenizer.tokenize(" Hellohow", add_special_tokens=False) + self.assertEqual(tokens, ["", "▁Hello", "", "how"]) + decoded_tokens = tokenizer.decode(input_ids) + self.assertEqual(decoded_tokens, " Hellohow") + + # Let's make 
sure that if there are any spaces, we don't remove them! + input_ids = tokenizer.encode(" Hello how", add_special_tokens=False) + self.assertEqual(input_ids, [259, 1, 15043, 1, 920]) + tokens = tokenizer.tokenize(" Hello how", add_special_tokens=False) + self.assertEqual(tokens, ["▁▁", "", "▁Hello", "", "▁how"]) + decoded_tokens = tokenizer.decode(input_ids) + self.assertEqual(decoded_tokens, " Hello how") + + def test_fill_token(self): + tokenizer = CodeLlamaTokenizerFast.from_pretrained( + "codellama/CodeLlama-7b-hf", fill_token=None, prefix_token=None, suffix_token=None, middle_token=None + ) + tokenizer.encode_plus("Hey how are you").input_ids + tokenizer.fill_token = "" + with self.assertRaises(ValueError): + tokenizer.encode("Hey how are you") + tokenizer.encode_plus("Hey how are you", "mne too") + tokenizer.tokenize("Hey how are you", "mne too") + + tokenizer = CodeLlamaTokenizerFast.from_pretrained( + "codellama/CodeLlama-7b-hf", revision="3773f63b4511b9e47a9a7ffc765eed7eb0169486" + ) + tokenizer.encode("Hey how are you") + tokenizer.encode_plus("Hey how are you", "mne too") + tokenizer.tokenize("Hey how are you", "mne too") + + def test_spm_edge_cases(self): + # the word inform should be split as ['in', 'form'] + tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf", legacy=False) + tokens = tokenizer.tokenize("[INST] How are you doing?[/INST]") + self.assertEqual( + tokens, ["▁[", "INST", "]", "▁How", "▁are", "▁you", "▁doing", "?", "", "[", "/", "INST", "]"] + ) + inputs_ids = tokenizer.encode("[INST] How are you doing?[/INST]") + self.assertEqual( + inputs_ids, [1, 518, 25580, 29962, 1128, 526, 366, 2599, 29973, 1, 29961, 29914, 25580, 29962] + ) + + def test_infilling_tokenization(self): + PROMPTS = [ + '''def remove_non_ascii(s: str) -> str: + """ + return result +''', + """# Installation instructions: + ```bash + + ``` +This downloads the LLaMA inference code and installs the repository as a local pip package. +""", + """class InterfaceManagerFactory(AbstractManagerFactory): + def __init__( +def main(): + factory = InterfaceManagerFactory(start=datetime.now()) + managers = [] + for i in range(10): + managers.append(factory.build(id=i)) +""", + """/-- A quasi-prefunctoid is 1-connected iff all its etalisations are 1-connected. 
-/ +theorem connected_iff_etalisation [C D : precategoroid] (P : quasi_prefunctoid C D) : +π₁ P = 0 ↔ = 0 := +begin +split, +{ intros h f, + rw pi_1_etalisation at h, + simp [h], + refl +}, +{ intro h, + have := @quasi_adjoint C D P, + simp [←pi_1_etalisation, this, h], + refl +} +end +""", + ] + tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf") + tokenizer_fast = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-Instruct-hf") + + formatted_prompt = tokenizer.tokenize(PROMPTS[0]) + self.assertEqual(formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0])) + prefix, suffix = PROMPTS[0].split("") + self.assertEqual(formatted_prompt, tokenizer.tokenize(prefix, suffix)) + self.assertEqual(formatted_prompt, tokenizer_fast.tokenize(prefix, suffix)) + + input_ids = tokenizer.encode(PROMPTS[0], add_special_tokens=False) + self.assertEqual(input_ids, tokenizer_fast.encode(PROMPTS[0], add_special_tokens=False)) + + prefix, suffix = PROMPTS[0].split("") + input_ids = tokenizer.encode(PROMPTS[0]) + self.assertEqual(input_ids, tokenizer.encode(prefix, suffix=suffix)) + self.assertEqual(tokenizer.encode(prefix, suffix=suffix), tokenizer_fast.encode(prefix, suffix=suffix)) + + # Adding suffix_first check for infilling tasks + suffix_first_formatted_prompt = tokenizer.tokenize(PROMPTS[0], suffix_first=True) + self.assertEqual(suffix_first_formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0], suffix_first=True)) + prefix, suffix = PROMPTS[0].split("") + self.assertEqual(suffix_first_formatted_prompt, tokenizer.tokenize(prefix, suffix, suffix_first=True)) + self.assertEqual(suffix_first_formatted_prompt, tokenizer_fast.tokenize(prefix, suffix, suffix_first=True)) + + prefix, suffix = PROMPTS[0].split("") + suffix_first_input_ids = tokenizer.encode(PROMPTS[0], suffix_first=True) + self.assertEqual(suffix_first_input_ids, tokenizer.encode(prefix, suffix=suffix, suffix_first=True)) + self.assertEqual(suffix_first_input_ids, tokenizer_fast.encode(prefix, suffix=suffix, suffix_first=True)) diff --git a/docs/transformers/tests/models/codegen/__init__.py b/docs/transformers/tests/models/codegen/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/codegen/test_modeling_codegen.py b/docs/transformers/tests/models/codegen/test_modeling_codegen.py new file mode 100644 index 0000000000000000000000000000000000000000..78f766a52acdc4c8f8d217a078ce6dd99333aed3 --- /dev/null +++ b/docs/transformers/tests/models/codegen/test_modeling_codegen.py @@ -0,0 +1,492 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
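+"""Testing suite for the PyTorch CodeGen model."""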
+ + +import unittest + +from transformers import CodeGenConfig, is_torch_available +from transformers.file_utils import cached_property +from transformers.testing_utils import backend_manual_seed, require_torch, slow, torch_device + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import AutoTokenizer, CodeGenForCausalLM, CodeGenModel + + +class CodeGenModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=256, + hidden_size=32, + rotary_dim=4, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.rotary_dim = rotary_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = None + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + def get_large_model_config(self): + return CodeGenConfig.from_pretrained("Salesforce/codegen-2B-mono") + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def get_config(self): + return CodeGenConfig( + vocab_size=self.vocab_size, + 
n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + use_cache=True, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + rotary_dim=self.rotary_dim, + ) + + def create_and_check_codegen_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = CodeGenModel(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(len(result.past_key_values), config.n_layer) + + def create_and_check_codegen_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = CodeGenModel(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) + outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + output, past = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) + + # append to next input_ids and token_type_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) + + output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] + output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_codegen_model_attention_mask_past( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = CodeGenModel(config=config) + model.to(torch_device) + model.eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + half_seq_length = self.seq_length // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + output, past = model(input_ids, attention_mask=attn_mask).to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + 
random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_codegen_model_past_large_inputs( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = CodeGenModel(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True) + + output, past = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and token_type_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask + )["last_hidden_state"] + output_from_past = model( + next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past + )["last_hidden_state"] + self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = CodeGenForCausalLM(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_forward_and_backwards( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False + ): + model = CodeGenForCausalLM(config) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + model.to(torch_device) + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + 
self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + result.loss.backward() + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask} + + return config, inputs_dict + + +@require_torch +class CodeGenModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (CodeGenModel, CodeGenForCausalLM) if is_torch_available() else () + pipeline_model_mapping = ( + {"feature-extraction": CodeGenModel, "text-generation": CodeGenForCausalLM} if is_torch_available() else {} + ) + fx_compatible = False + test_pruning = False + test_missing_keys = False + test_model_parallel = False + test_head_masking = False + + # special case for DoubleHeads model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + return inputs_dict + + def setUp(self): + self.model_tester = CodeGenModelTester(self) + self.config_tester = ConfigTester(self, config_class=CodeGenConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_codegen_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_codegen_model(*config_and_inputs) + + def test_codegen_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_codegen_model_past(*config_and_inputs) + + def test_codegen_model_att_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_codegen_model_attention_mask_past(*config_and_inputs) + + def test_codegen_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_codegen_model_past_large_inputs(*config_and_inputs) + + def test_codegen_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_codegen_gradient_checkpointing(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) + + @slow + def test_batch_generation(self): + tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono") + model = CodeGenForCausalLM.from_pretrained("Salesforce/codegen-350M-mono") + model.to(torch_device) + + tokenizer.padding_side = "left" + + # Define PAD Token = EOS Token = 50256 + tokenizer.pad_token = tokenizer.eos_token + model.config.pad_token_id = model.config.eos_token_id + + # use different length sentences to test batching + sentences = ["def hellow_world():", "def greet(name):"] + + inputs = tokenizer(sentences, return_tensors="pt", padding=True) + input_ids = inputs["input_ids"].to(torch_device) + token_type_ids = torch.cat( + [ + input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0), + input_ids.new_full((input_ids.shape[0], 1), 500), + ], + dim=-1, + ) + + outputs = model.generate( + 
input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + ) + + outputs_tt = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + token_type_ids=token_type_ids, + ) + + inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) + output_non_padded = model.generate(input_ids=inputs_non_padded) + + num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item() + inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) + output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) + + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) + batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) + + expected_output_sentence = [ + 'def hellow_world():\n print("Hello World")\n\nhellow_world()', + 'def greet(name):\n print(f"Hello {name}")\n\ng', + ] + self.assertListEqual(expected_output_sentence, batch_out_sentence) + self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) + + @slow + def test_model_from_pretrained(self): + model_name = "Salesforce/codegen-350M-nl" + model = CodeGenModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class CodeGenModelLanguageGenerationTest(unittest.TestCase): + @cached_property + def cached_tokenizer(self): + return AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono") + + @cached_property + def cached_model(self): + return CodeGenForCausalLM.from_pretrained("Salesforce/codegen-350M-mono") + + @slow + def test_lm_generate_codegen(self): + tokenizer = self.cached_tokenizer + for checkpointing in [True, False]: + model = self.cached_model + + if checkpointing: + model.gradient_checkpointing_enable() + else: + model.gradient_checkpointing_disable() + model.to(torch_device) + + inputs = tokenizer("def hello_world():", return_tensors="pt").to(torch_device) + expected_output = 'def hello_world():\n print("Hello World")\n\nhello_world()\n\n' + + output_ids = model.generate(**inputs, do_sample=False) + output_str = tokenizer.batch_decode(output_ids)[0] + + self.assertEqual(output_str, expected_output) + + @slow + def test_codegen_sample(self): + tokenizer = self.cached_tokenizer + model = self.cached_model + model.to(torch_device) + + torch.manual_seed(0) + backend_manual_seed(torch_device, 0) + + tokenized = tokenizer("def hello_world():", return_tensors="pt", return_token_type_ids=True) + input_ids = tokenized.input_ids.to(torch_device) + output_ids = model.generate(input_ids, do_sample=True) + output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) + + token_type_ids = tokenized.token_type_ids.to(torch_device) + output_seq = model.generate(input_ids=input_ids, do_sample=True, num_return_sequences=5) + output_seq_tt = model.generate( + input_ids=input_ids, token_type_ids=token_type_ids, do_sample=True, num_return_sequences=5 + ) + output_seq_strs = tokenizer.batch_decode(output_seq, skip_special_tokens=True) + output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, skip_special_tokens=True) + + if torch_device == "cuda": + EXPECTED_OUTPUT_STR = 'def 
hello_world():\n print("Hello World")\n return True\n\nresult =' + else: + EXPECTED_OUTPUT_STR = "def hello_world():\r\n print('Hello, World.')\r\n\r\n\r" + + self.assertEqual(output_str, EXPECTED_OUTPUT_STR) + self.assertTrue( + all(output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))) + ) # token_type_ids should change output diff --git a/docs/transformers/tests/models/codegen/test_tokenization_codegen.py b/docs/transformers/tests/models/codegen/test_tokenization_codegen.py new file mode 100644 index 0000000000000000000000000000000000000000..a0ea547566cd6abefbe5910c1d833019ca3183bb --- /dev/null +++ b/docs/transformers/tests/models/codegen/test_tokenization_codegen.py @@ -0,0 +1,329 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import re +import unittest +from functools import lru_cache + +from transformers import CodeGenTokenizer, CodeGenTokenizerFast +from transformers.models.codegen.tokenization_codegen import VOCAB_FILES_NAMES +from transformers.testing_utils import require_tokenizers, slow + +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible + + +@require_tokenizers +class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "Salesforce/codegen-350M-mono" + tokenizer_class = CodeGenTokenizer + rust_tokenizer_class = CodeGenTokenizerFast + test_rust_tokenizer = True + from_pretrained_kwargs = {"add_prefix_space": True} + test_seq2seq = False + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "", + "<|endoftext|>", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + cls.special_tokens_map = {"unk_token": ""} + + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(cls.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return CodeGenTokenizer.from_pretrained(pretrained_name, **kwargs) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return CodeGenTokenizerFast.from_pretrained(pretrained_name, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "lower newer" + output_text = "lower newer" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = CodeGenTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "lower newer" + bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] + tokens = tokenizer.tokenize(text, add_prefix_space=True) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) + + sequence = "lower newer" + + # Testing tokenization + tokens = tokenizer.tokenize(sequence, add_prefix_space=True) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + # Testing conversion to ids without special tokens + ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + # Testing conversion to ids with special tokens + rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) + ids = tokenizer.encode(sequence, add_prefix_space=True) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + # Testing the unknown token + input_tokens = tokens + [rust_tokenizer.unk_token] + input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] + self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + @unittest.skip + def test_pretokenized_inputs(self, *args, **kwargs): + # It's very difficult to mix/test pretokenization with byte-level + # And get both CodeGen and Roberta to work at the same time (mostly an issue of adding a space before the string) + pass + + def test_padding(self, 
max_length=15): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + + # Simple input + s = "This is a simple input" + s2 = ["This is a simple input 1", "This is a simple input 2"] + p = ("This is a simple input", "This is a pair") + p2 = [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ] + + # Simple input tests + self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + s2, + max_length=max_length, + padding="max_length", + ) + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + p2, + max_length=max_length, + padding="max_length", + ) + + def test_padding_if_pad_token_set_slow(self): + tokenizer = CodeGenTokenizer.from_pretrained(self.tmpdirname, pad_token="") + + # Simple input + s = "This is a simple input" + s2 = ["This is a simple input looooooooong", "This is a simple input"] + p = ("This is a simple input", "This is a pair") + p2 = [ + ("This is a simple input loooooong", "This is a simple input"), + ("This is a simple pair loooooong", "This is a simple pair"), + ] + + pad_token_id = tokenizer.pad_token_id + + out_s = tokenizer(s, padding="max_length", max_length=30, return_tensors="np") + out_s2 = tokenizer(s2, padding=True, truncate=True, return_tensors="np") + out_p = tokenizer(*p, padding="max_length", max_length=60, return_tensors="np") + out_p2 = tokenizer(p2, padding=True, truncate=True, return_tensors="np") + + # s + # test single string max_length padding + self.assertEqual(out_s["input_ids"].shape[-1], 30) + self.assertTrue(pad_token_id in out_s["input_ids"]) + self.assertTrue(0 in out_s["attention_mask"]) + + # s2 + # test automatic padding + self.assertEqual(out_s2["input_ids"].shape[-1], 33) + # long slice doesn't have padding + self.assertFalse(pad_token_id in out_s2["input_ids"][0]) + self.assertFalse(0 in out_s2["attention_mask"][0]) + # short slice does have padding + self.assertTrue(pad_token_id in out_s2["input_ids"][1]) + self.assertTrue(0 in out_s2["attention_mask"][1]) + + # p + # test single pair max_length padding + self.assertEqual(out_p["input_ids"].shape[-1], 60) + self.assertTrue(pad_token_id in out_p["input_ids"]) + self.assertTrue(0 in out_p["attention_mask"]) + + # p2 + # test automatic padding pair + self.assertEqual(out_p2["input_ids"].shape[-1], 52) + # long slice pair doesn't have padding + self.assertFalse(pad_token_id in out_p2["input_ids"][0]) + self.assertFalse(0 in out_p2["attention_mask"][0]) + # short slice pair does have padding + self.assertTrue(pad_token_id in out_p2["input_ids"][1]) + self.assertTrue(0 in out_p2["attention_mask"][1]) + + def test_add_bos_token_slow(self): + bos_token = "$$$" + tokenizer = CodeGenTokenizer.from_pretrained(self.tmpdirname, bos_token=bos_token, add_bos_token=True) + + s = "This is a simple input" + s2 = ["This is a simple input 1", "This is a simple input 2"] + + 
bos_token_id = tokenizer.bos_token_id + + out_s = tokenizer(s) + out_s2 = tokenizer(s2) + + self.assertEqual(out_s.input_ids[0], bos_token_id) + self.assertTrue(all(o[0] == bos_token_id for o in out_s2.input_ids)) + + decode_s = tokenizer.decode(out_s.input_ids) + decode_s2 = tokenizer.batch_decode(out_s2.input_ids) + + self.assertTrue(decode_s.startswith(bos_token)) + self.assertTrue(all(d.startswith(bos_token) for d in decode_s2)) + + @slow + def test_truncation(self): + tokenizer = CodeGenTokenizer.from_pretrained("Salesforce/codegen-350M-mono") + + text = "\nif len_a > len_b:\n result = a\nelse:\n result = b\n\n\n\n#" + expected_truncated_text = "\nif len_a > len_b:\n result = a\nelse:\n result = b" + + input_ids = tokenizer.encode(text) + truncation_pattern = ["^#", re.escape("<|endoftext|>"), "^'''", '^"""', "\n\n\n"] + decoded_text = tokenizer.decode(input_ids, truncate_before_pattern=truncation_pattern) + self.assertEqual(decoded_text, expected_truncated_text) + # TODO @ArthurZ outputs of the fast tokenizer are different in this case, un-related to the PR + + # tokenizer has no padding token + @unittest.skip(reason="tokenizer has no padding token") + def test_padding_different_model_input_name(self): + pass + + @slow + def test_tokenizer_integration(self): + # Custom test since this tokenizer takes return_token_type_ids as an init argument for backward compatibility. + + sequences = [ + "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides " + "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural " + "Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained " + "models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.", + "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly " + "conditioning on both left and right context in all layers.", + "The quick brown fox jumps over the lazy dog.", + ] + + tokenizer_classes = [self.tokenizer_class] + if self.test_rust_tokenizer: + tokenizer_classes.append(self.rust_tokenizer_class) + + # Test default case. i.e. return_token_type_ids is False. 
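+ # In the default case the encoding is expected to contain only input_ids and attention_mask (no token_type_ids), and decoding with skip_special_tokens=True should round-trip the original sequences.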
+ for tokenizer_class in tokenizer_classes: + tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-350M-mono") + + encoding = tokenizer(sequences) + decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]] + + # fmt: off + expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501 + # fmt: on + + encoding_data = encoding.data + self.assertDictEqual(encoding_data, expected_encoding) + + for expected, decoded in zip(sequences, decoded_sequences): + self.assertEqual(expected, decoded) + + # Test return_token_type_ids is True case. + for tokenizer_class in tokenizer_classes: + tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-350M-mono", return_token_type_ids=True) + + encoding = tokenizer(sequences) + decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]] + + # fmt: off + expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501 + # fmt: on + + encoding_data = encoding.data + self.assertDictEqual(encoding_data, expected_encoding) + + for expected, decoded in zip(sequences, decoded_sequences): + self.assertEqual(expected, decoded) diff --git a/docs/transformers/tests/models/cohere/__init__.py b/docs/transformers/tests/models/cohere/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/cohere/test_modeling_cohere.py b/docs/transformers/tests/models/cohere/test_modeling_cohere.py new file mode 100644 index 0000000000000000000000000000000000000000..bebafedc7df83dc84bf51fd850ccca827106ad31 --- /dev/null +++ b/docs/transformers/tests/models/cohere/test_modeling_cohere.py @@ -0,0 +1,254 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Cohere model.""" + +import unittest + +from transformers import CohereConfig, is_torch_available +from transformers.testing_utils import ( + require_bitsandbytes, + require_torch, + require_torch_multi_gpu, + require_torch_sdpa, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import AutoTokenizer, CohereForCausalLM, CohereModel + + +# Copied from transformers.tests.models.llama.LlamaModelTester with Llama->Cohere +class CohereModelTester: + config_class = CohereConfig + if is_torch_available(): + model_class = CohereModel + for_causal_lm_class = CohereForCausalLM + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=4, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + pad_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + 
self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + # Ignore copy + def get_config(self): + return self.config_class( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = self.model_class(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class CohereModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (CohereModel, CohereForCausalLM) if is_torch_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": CohereModel, + "text-generation": CohereForCausalLM, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + fx_compatible = False + + # Need to use `0.8` instead of `0.9` for `test_cpu_offload` + # This is because we are hitting edge cases with the causal_mask buffer + model_split_percents = [0.5, 0.7, 0.8] + + def setUp(self): + self.model_tester = CohereModelTester(self) + self.config_tester = ConfigTester(self, config_class=CohereConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_torch_fx_output_loss(self): + super().test_torch_fx_output_loss() + + +@require_torch +@slow +class CohereIntegrationTest(unittest.TestCase): + @require_torch_multi_gpu + @require_bitsandbytes + def test_batched_4bit(self): + model_id = "CohereForAI/c4ai-command-r-v01-4bit" + + EXPECTED_TEXT = [ + 'Hello today I am going to show you how to make a simple and easy card using the new stamp set called "Hello" from the Occasions catalog. This set is so versatile and can be used for many occasions. I used the new In', + "Hi there, here we are again with another great collection of free fonts for your next project. This time we have gathered 10 free fonts that you can download and use in your designs. These fonts are perfect for any kind", + ] + + model = CohereForCausalLM.from_pretrained(model_id, device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(model_id) + + tokenizer.pad_token = tokenizer.eos_token + + text = ["Hello today I am going to show you how to", "Hi there, here we are"] + inputs = tokenizer(text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=40, do_sample=False) + self.assertEqual(tokenizer.batch_decode(output, skip_special_tokens=True), EXPECTED_TEXT) + + @require_torch_sdpa + def test_batched_small_model_logits(self): + # Since the model is very large, we created a random cohere model so that we can do a simple + # logits check on it. + model_id = "hf-internal-testing/cohere-random" + + EXPECTED_LOGITS = torch.Tensor( + [ + [[0.0000, 0.1866, -0.1997], [0.0000, -0.0736, 0.1785], [0.0000, -0.1965, -0.0569]], + [[0.0000, -0.0302, 0.1488], [0.0000, -0.0402, 0.1351], [0.0000, -0.0341, 0.1116]], + ] + ).to(device=torch_device, dtype=torch.float16) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = CohereForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( + torch_device + ) + + tokenizer.pad_token = tokenizer.eos_token + + text = ["Hello today I am going to show you how to", "Hi there, here we are"] + inputs = tokenizer(text, return_tensors="pt", padding=True).to(torch_device) + + with torch.no_grad(): + output = model(**inputs) + + logits = output.logits + torch.testing.assert_close(EXPECTED_LOGITS, logits[:, :3, :3], rtol=1e-3, atol=1e-3) diff --git a/docs/transformers/tests/models/cohere/test_tokenization_cohere.py b/docs/transformers/tests/models/cohere/test_tokenization_cohere.py new file mode 100644 index 0000000000000000000000000000000000000000..d162b999819094513e33524d780c4ddcaaa3d2ae --- /dev/null +++ b/docs/transformers/tests/models/cohere/test_tokenization_cohere.py @@ -0,0 +1,296 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import unittest +from functools import lru_cache + +from transformers import CohereTokenizerFast +from transformers.testing_utils import require_jinja, require_tokenizers, require_torch_multi_gpu + +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible + + +@require_tokenizers +class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + slow_tokenizer_class = None + rust_tokenizer_class = CohereTokenizerFast + tokenizer_class = CohereTokenizerFast + test_rust_tokenizer = True + test_slow_tokenizer = False + from_pretrained_vocab_key = "tokenizer_file" + from_pretrained_id = "hf-internal-testing/tiny-random-CohereForCausalLM" + special_tokens_map = { + "bos_token": "", + "eos_token": "<|END_OF_TURN_TOKEN|>", + "unk_token": "", + "pad_token": "", + } + + @classmethod + def setUpClass(cls): + super().setUpClass() + tokenizer = CohereTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-CohereForCausalLM") + tokenizer.save_pretrained(cls.tmpdirname) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + _kwargs = copy.deepcopy(cls.special_tokens_map) + _kwargs.update(kwargs) + kwargs = _kwargs + pretrained_name = pretrained_name or cls.tmpdirname + return CohereTokenizerFast.from_pretrained(pretrained_name, **kwargs) + + # This gives CPU OOM on a single-gpu runner (~60G RAM). On multi-gpu runner, it has ~180G RAM which is enough. + @require_torch_multi_gpu + def test_torch_encode_plus_sent_to_model(self): + super().test_torch_encode_plus_sent_to_model() + + @unittest.skip(reason="This needs a slow tokenizer. 
Cohere does not have one!") + def test_encode_decode_with_spaces(self): + return + + def test_encodings_from_sample_data(self): + """ + Assert that the created tokens are the same than the hard-coded ones + """ + tokenizer = self.get_rust_tokenizer() + + INPUT_SENTENCES = ["The quick brown fox<|END_OF_TURN_TOKEN|>", "jumps over the lazy dog<|END_OF_TURN_TOKEN|>"] + TARGET_TOKENS = [ + [5, 60, 203, 746, 666, 980, 571, 222, 87, 96, 8], + [5, 82, 332, 88, 91, 544, 206, 257, 930, 97, 239, 435, 8], + ] + + computed_tokens = tokenizer.batch_encode_plus(INPUT_SENTENCES)["input_ids"] + self.assertListEqual(TARGET_TOKENS, computed_tokens) + + INPUT_SENTENCES_W_BOS = [ + "The quick brown fox<|END_OF_TURN_TOKEN|>", + "jumps over the lazy dog<|END_OF_TURN_TOKEN|>", + ] + decoded_tokens = tokenizer.batch_decode(computed_tokens) + self.assertListEqual(decoded_tokens, INPUT_SENTENCES_W_BOS) + + def test_padding(self, max_length=10): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + # tokenizer_r.pad_token = None # Hotfixing padding = None + # Simple input + s = "This is a simple input" + s2 = ["This is a simple input 1", "This is a simple input 2"] + p = ("This is a simple input", "This is a pair") + p2 = [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ] + + # Simple input tests + try: + tokenizer_r.encode(s, max_length=max_length) + tokenizer_r.encode_plus(s, max_length=max_length) + + tokenizer_r.batch_encode_plus(s2, max_length=max_length) + tokenizer_r.encode(p, max_length=max_length) + tokenizer_r.batch_encode_plus(p2, max_length=max_length) + except ValueError: + self.fail("Cohere Tokenizer should be able to deal with padding") + + tokenizer_r.pad_token = None # Hotfixing padding = None + self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + s2, + max_length=max_length, + padding="max_length", + ) + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + p2, + max_length=max_length, + padding="max_length", + ) + + def test_pretrained_model_lists(self): + # No `max_model_input_sizes` for Cohere model + self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1) + self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1) + + @require_jinja + def test_tokenization_for_chat(self): + tokenizer = self.get_rust_tokenizer() + test_chats = [ + [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], + [ + {"role": "system", "content": "You are a helpful chatbot."}, + {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Nice to meet you."}, + ], + ] + tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] + # fmt: off + expected_tokens = [ + [5, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 
54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 59, 65, 59, 60, 45, 53, 71, 60, 55, 51, 45, 54, 99, 38, 65, 243, 394, 204, 336, 84, 88, 887, 374, 216, 74, 286, 22, 8, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 61, 59, 45, 58, 71, 60, 55, 51, 45, 54, 99, 38, 48, 420, 87, 9, 8], + [5, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 59, 65, + 59, 60, 45, 53, 71, 60, 55, 51, 45, 54, 99, 38, 65, 243, 394, 204, 336, 84, 88, 887, 374, 216, 74, 286, 22, 8, + 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 61, 59, + 45, 58, 71, 60, 55, 51, 45, 54, 99, 38, 48, 420, 87, 9, 8, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, + 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 43, 48, 41, 60, 42, 55, 60, 71, 60, 55, 51, 45, 54, 99, 38, + 54, 567, 235, 693, 276, 411, 243, 22, 8] + ] + # fmt: on + for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): + self.assertListEqual(tokenized_chat, expected_tokens) + + @require_jinja + def test_tokenization_for_tool_use(self): + tokenizer = self.get_rust_tokenizer() + + conversation = [{"role": "user", "content": "Whats the biggest penguin in the world?"}] + + tools = [ + { + "name": "internet_search", + "description": "Returns a list of relevant document snippets for a textual query retrieved from the internet", + "parameter_definitions": { + "query": {"description": "Query to search the internet with", "type": "str", "required": True} + }, + }, + { + "name": "directly_answer", + "description": "Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history", + "parameter_definitions": {}, + }, + ] + + tool_use_prompt = tokenizer.apply_tool_use_template( + conversation, + tools=tools, + tokenize=False, + add_generation_prompt=True, + ) + + expected_prompt = '''<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble +The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral. + +# System Preamble +## Basic Rules +You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions. + +# User Preamble +## Task and Context +You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. + +## Style Guide +Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling. 
+ +## Available Tools +Here is a list of tools that you have available to you: + +```python +def internet_search(query: str) -> List[Dict]: + """Returns a list of relevant document snippets for a textual query retrieved from the internet + + Args: + query (str): Query to search the internet with + """ + pass +``` + +```python +def directly_answer() -> List[Dict]: + """Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history + """ + pass +```<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Whats the biggest penguin in the world?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example: +```json +[ + { + "tool_name": title of the tool in the specification, + "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters + } +]```<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>''' + + self.assertEqual(tool_use_prompt, expected_prompt) + + @require_jinja + def test_tokenization_for_grounded_generation(self): + tokenizer = self.get_rust_tokenizer() + conversation = [{"role": "user", "content": "Whats the biggest penguin in the world?"}] + + documents = [ + {"title": "Tall penguins", "text": "Emperor penguins are the tallest growing up to 122 cm in height."}, + {"title": "Penguin habitats", "text": "Emperor penguins only live in Antarctica."}, + ] + + grounded_generation_prompt = tokenizer.apply_grounded_generation_template( + conversation, + documents=documents, + citation_mode="accurate", # or "fast" + tokenize=False, + add_generation_prompt=True, + ) + + expected_prompt = """<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble +The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral. + +# System Preamble +## Basic Rules +You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions. + +# User Preamble +## Task and Context +You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. 
+ +## Style Guide +Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Whats the biggest penguin in the world?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|> +Document: 0 +title: Tall penguins +text: Emperor penguins are the tallest growing up to 122 cm in height. + +Document: 1 +title: Penguin habitats +text: Emperor penguins only live in Antarctica. +<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Carefully perform the following instructions, in order, starting each with a new line. +Firstly, Decide which of the retrieved documents are relevant to the user's last input by writing 'Relevant Documents:' followed by comma-separated list of document numbers. If none are relevant, you should instead write 'None'. +Secondly, Decide which of the retrieved documents contain facts that should be cited in a good answer to the user's last input by writing 'Cited Documents:' followed a comma-separated list of document numbers. If you dont want to cite any of them, you should instead write 'None'. +Thirdly, Write 'Answer:' followed by a response to the user's last input in high quality natural english. Use the retrieved documents to help you. Do not insert any citations or grounding markup. +Finally, Write 'Grounded answer:' followed by a response to the user's last input in high quality natural english. Use the symbols and to indicate when a fact comes from a document in the search result, e.g my fact for a fact from document 0.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>""" + + self.assertEqual(grounded_generation_prompt, expected_prompt) + + def test_add_prefix_space_fast(self): + tokenizer_w_prefix = self.get_rust_tokenizer(add_prefix_space=True) + tokenizer_wo_prefix = self.get_rust_tokenizer(add_prefix_space=False) + tokens_w_prefix = tokenizer_w_prefix.tokenize("Hey") + tokens_wo_prefix = tokenizer_wo_prefix.tokenize("Hey") + self.assertNotEqual(tokens_w_prefix, tokens_wo_prefix) diff --git a/docs/transformers/tests/models/cohere2/__init__.py b/docs/transformers/tests/models/cohere2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/cohere2/test_modeling_cohere2.py b/docs/transformers/tests/models/cohere2/test_modeling_cohere2.py new file mode 100644 index 0000000000000000000000000000000000000000..63c067df57fc7485faafd73e2122bf2b83af4635 --- /dev/null +++ b/docs/transformers/tests/models/cohere2/test_modeling_cohere2.py @@ -0,0 +1,310 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Cohere2 model.""" + +import unittest + +import pytest +from packaging import version +from parameterized import parameterized +from pytest import mark + +from transformers import AutoModelForCausalLM, AutoTokenizer, Cohere2Config, is_torch_available, pipeline +from transformers.generation.configuration_utils import GenerationConfig +from transformers.testing_utils import ( + require_flash_attn, + require_read_token, + require_torch, + require_torch_large_gpu, + slow, + torch_device, +) + +from ...models.cohere.test_modeling_cohere import CohereModelTest, CohereModelTester +from ...test_configuration_common import ConfigTester + + +if is_torch_available(): + import torch + + from transformers import ( + Cohere2ForCausalLM, + Cohere2Model, + ) + + +class Cohere2ModelTester(CohereModelTester): + config_class = Cohere2Config + if is_torch_available(): + model_class = Cohere2Model + for_causal_lm_class = Cohere2ForCausalLM + + +@require_torch +class Cohere2ModelTest(CohereModelTest, unittest.TestCase): + all_model_classes = (Cohere2Model, Cohere2ForCausalLM) if is_torch_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": Cohere2Model, + "text-generation": Cohere2ForCausalLM, + } + if is_torch_available() + else {} + ) + _is_stateful = True + + def setUp(self): + self.model_tester = Cohere2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=Cohere2Config, hidden_size=37) + + @unittest.skip("Failing because of unique cache (HybridCache)") + def test_model_outputs_equivalence(self, **kwargs): + pass + + @unittest.skip("Cohere2's forcefully disables sdpa due to softcapping") + def test_sdpa_can_dispatch_non_composite_models(self): + pass + + @unittest.skip("Cohere2's eager attn/sdpa attn outputs are expected to be different") + def test_eager_matches_sdpa_generate(self): + pass + + @parameterized.expand([("random",), ("same",)]) + @pytest.mark.generate + @unittest.skip("Cohere2 has HybridCache which is not compatible with assisted decoding") + def test_assisted_decoding_matches_greedy_search(self, assistant_type): + pass + + @unittest.skip("Cohere2 has HybridCache which is not compatible with assisted decoding") + def test_prompt_lookup_decoding_matches_greedy_search(self, assistant_type): + pass + + @pytest.mark.generate + @unittest.skip("Cohere2 has HybridCache which is not compatible with assisted decoding") + def test_assisted_decoding_sample(self): + pass + + @unittest.skip("Cohere2 has HybridCache which is not compatible with dola decoding") + def test_dola_decoding_sample(self): + pass + + @unittest.skip("Cohere2 has HybridCache and doesn't support continue from past kv") + def test_generate_continue_from_past_key_values(self): + pass + + @unittest.skip("Cohere2 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate(self): + pass + + @unittest.skip("Cohere2 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip("Cohere2 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate_low_memory(self): + pass + + @unittest.skip("Cohere2 has HybridCache and doesn't support StaticCache. Though it could, it shouldn't support.") + def test_generate_with_static_cache(self): + pass + + @unittest.skip("Cohere2 has HybridCache and doesn't support StaticCache. 
Though it could, it shouldn't support.") + def test_generate_from_inputs_embeds_with_static_cache(self): + pass + + @unittest.skip("Cohere2 has HybridCache and doesn't support progressive generation using input embeds.") + def test_generate_continue_from_inputs_embeds(self): + pass + + @unittest.skip("Cohere2's eager attn/sdpa attn outputs are expected to be different") + def test_sdpa_equivalence(self): + pass + + +@slow +@require_read_token +@require_torch_large_gpu +class Cohere2IntegrationTest(unittest.TestCase): + input_text = ["Hello I am doing", "Hi today"] + # This variable is used to determine which CUDA device we are using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + def test_model_bf16(self): + model_id = "CohereForAI/c4ai-command-r7b-12-2024" + EXPECTED_TEXTS = [ + "Hello I am doing a project for a school assignment and I need to create a website for a fictional company. I have", + "Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager" + ).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=False) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_fp16(self): + model_id = "CohereForAI/c4ai-command-r7b-12-2024" + EXPECTED_TEXTS = [ + "Hello I am doing a project for a school assignment and I need to create a website for a fictional company. I have", + "Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager" + ).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=False) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_pipeline_bf16(self): + # See https://github.com/huggingface/transformers/pull/31747 -- pipeline was broken for Cohere2 before this PR + model_id = "CohereForAI/c4ai-command-r7b-12-2024" + # EXPECTED_TEXTS should match the same non-pipeline test, minus the special tokens + EXPECTED_TEXTS = [ + "Hello I am doing a project for a school assignment and I need to create a website for a fictional company.
I have", + "Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention" + ).to(torch_device) + tokenizer = AutoTokenizer.from_pretrained(model_id) + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + + output = pipe(self.input_text, max_new_tokens=20, do_sample=False, padding=True) + + self.assertEqual(output[0][0]["generated_text"], EXPECTED_TEXTS[0]) + self.assertEqual(output[1][0]["generated_text"], EXPECTED_TEXTS[1]) + + @require_flash_attn + @mark.flash_attn_test + def test_model_flash_attn(self): + # See https://github.com/huggingface/transformers/issues/31953 --- flash attn was generating garbage for Gemma2, especially in long context + model_id = "CohereForAI/c4ai-command-r7b-12-2024" + EXPECTED_TEXTS = [ + 'Hello I am doing a project for my school and I need to create a website for a fictional company. I have the logo and the name of the company. I need a website that is simple and easy to navigate. I need a home page, about us, services, contact us, and a gallery. I need the website to be responsive and I need it to be able to be hosted on a server. I need the website to be done in a week. I need the website to be done in HTML,', + "Hi today I'm going to show you how to make a simple and easy to make a chocolate cake.\n\nThis recipe is very simple and easy to make.\n\nYou will need:\n\n* 2 cups of flour\n* 1 cup of sugar\n* 1/2 cup of cocoa powder\n* 1 teaspoon of baking powder\n* 1 teaspoon of baking soda\n* 1/2 teaspoon of salt\n* 2 eggs\n* 1 cup of milk\n", + ] # fmt: skip + + model = AutoModelForCausalLM.from_pretrained( + model_id, attn_implementation="flash_attention_2", torch_dtype="float16" + ).to(torch_device) + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=100, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=False) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_export_static_cache(self): + if version.parse(torch.__version__) < version.parse("2.5.0"): + self.skipTest(reason="This test requires torch >= 2.5 to run.") + + from transformers.integrations.executorch import ( + TorchExportableModuleWithStaticCache, + convert_and_export_with_cache, + ) + + model_id = "CohereForAI/c4ai-command-r7b-12-2024" + EXPECTED_TEXT_COMPLETION = [ + "Hello I am doing a project on the effects of social media on mental health. I have a few questions. 1. 
What is the relationship", + ] + + tokenizer = AutoTokenizer.from_pretrained(model_id, pad_token="", padding_side="right") + # Load model + device = "cpu" + dtype = torch.bfloat16 + cache_implementation = "static" + attn_implementation = "sdpa" + batch_size = 1 + model = AutoModelForCausalLM.from_pretrained( + "CohereForAI/c4ai-command-r7b-12-2024", + device_map=device, + torch_dtype=dtype, + attn_implementation=attn_implementation, + generation_config=GenerationConfig( + use_cache=True, + cache_implementation=cache_implementation, + max_length=30, + cache_config={ + "batch_size": batch_size, + "max_cache_len": 30, + }, + ), + ) + + prompts = ["Hello I am doing"] + prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + prompt_token_ids = prompt_tokens["input_ids"] + max_new_tokens = 30 - prompt_token_ids.shape[-1] + + # Static Cache + export + exported_program = convert_and_export_with_cache(model) + ep_generated_ids = TorchExportableModuleWithStaticCache.generate( + exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens + ) + ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text) + + @parameterized.expand([("flash_attention_2",), ("sdpa",), ("flex_attention",), ("eager",)]) + @require_read_token + def test_generation_beyond_sliding_window(self, attn_implementation: str): + """Test that we can correctly generate beyond the sliding window. This is non trivial as + we need to correctly slice the attention mask in all cases (because we use a HybridCache). + Outputs for every attention functions should be coherent and identical. + """ + model_id = "CohereForAI/c4ai-command-r7b-12-2024" + EXPECTED_COMPLETIONS = [ + " the mountains, the lakes, the rivers, the waterfalls, the waterfalls, the waterfalls, the waterfalls", + ", green, yellow, orange, purple, pink, brown, black, white, grey, silver", + ] + + input_text = [ + "This is a nice place. " * 800 + "I really enjoy the scenery,", # This is larger than 4096 tokens + "A list of colors: red, blue", # This will almost all be padding tokens + ] + tokenizer = AutoTokenizer.from_pretrained(model_id, padding="left") + inputs = tokenizer(input_text, padding=True, return_tensors="pt").to(torch_device) + + model = AutoModelForCausalLM.from_pretrained( + model_id, attn_implementation=attn_implementation, torch_dtype=torch.float16 + ).to(torch_device) + + # Make sure prefill is larger than sliding window + input_size = inputs.input_ids.shape[-1] + self.assertTrue(input_size > model.config.sliding_window) + + out = model.generate(**inputs, max_new_tokens=20)[:, input_size:] + output_text = tokenizer.batch_decode(out) + + self.assertEqual(output_text, EXPECTED_COMPLETIONS) diff --git a/docs/transformers/tests/models/colpali/__init__.py b/docs/transformers/tests/models/colpali/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/colpali/test_modeling_colpali.py b/docs/transformers/tests/models/colpali/test_modeling_colpali.py new file mode 100644 index 0000000000000000000000000000000000000000..c0bcc9177117dca2901dcc7578921969af0136ff --- /dev/null +++ b/docs/transformers/tests/models/colpali/test_modeling_colpali.py @@ -0,0 +1,353 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch ColPali model.""" + +import gc +import unittest +from typing import ClassVar + +import torch +from datasets import load_dataset + +from tests.test_configuration_common import ConfigTester +from tests.test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from transformers import ( + is_torch_available, +) +from transformers.models.colpali.configuration_colpali import ColPaliConfig +from transformers.models.colpali.modeling_colpali import ColPaliForRetrieval, ColPaliForRetrievalOutput +from transformers.models.colpali.processing_colpali import ColPaliProcessor +from transformers.testing_utils import ( + require_torch, + require_vision, + slow, + torch_device, +) + + +if is_torch_available(): + import torch + + +class ColPaliForRetrievalModelTester: + def __init__( + self, + parent, + ignore_index=-100, + image_token_index=0, + projector_hidden_act="gelu", + seq_length=25, + vision_feature_select_strategy="default", + vision_feature_layer=-1, + projection_dim=32, + text_config={ + "model_type": "gemma", + "seq_length": 128, + "is_training": True, + "use_token_type_ids": False, + "use_labels": True, + "vocab_size": 99, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "num_key_value_heads": 1, + "head_dim": 8, + "intermediate_size": 37, + "hidden_activation": "gelu_pytorch_tanh", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 16, + "type_sequence_label_size": 2, + "initializer_range": 0.02, + "num_labels": 3, + "num_choices": 4, + "pad_token_id": 1, + }, + is_training=False, + vision_config={ + "use_labels": True, + "image_size": 20, + "patch_size": 5, + "num_image_tokens": 4, + "num_channels": 3, + "is_training": True, + "hidden_size": 32, + "projection_dim": 32, + "num_key_value_heads": 1, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "dropout": 0.1, + "attention_dropout": 0.1, + "initializer_range": 0.02, + }, + use_cache=False, + embedding_dim=128, + ): + self.parent = parent + self.ignore_index = ignore_index + # `image_token_index` is set to 0 to pass "resize_embeddings" test, do not modify + self.image_token_index = image_token_index + self.projector_hidden_act = projector_hidden_act + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + self.text_config = text_config + self.vision_config = vision_config + self.seq_length = seq_length + self.projection_dim = projection_dim + self.pad_token_id = text_config["pad_token_id"] + + self.num_hidden_layers = text_config["num_hidden_layers"] + self.vocab_size = text_config["vocab_size"] + self.hidden_size = text_config["hidden_size"] + self.num_attention_heads = text_config["num_attention_heads"] + self.is_training = is_training + + self.batch_size = 3 + self.num_channels = vision_config["num_channels"] + self.image_size = 
vision_config["image_size"] + self.encoder_seq_length = seq_length + self.use_cache = use_cache + + self.embedding_dim = embedding_dim + self.vlm_config = { + "model_type": "paligemma", + "text_config": self.text_config, + "vision_config": self.vision_config, + "ignore_index": self.ignore_index, + "image_token_index": self.image_token_index, + "projector_hidden_act": self.projector_hidden_act, + "projection_dim": self.projection_dim, + "vision_feature_select_strategy": self.vision_feature_select_strategy, + "vision_feature_layer": self.vision_feature_layer, + } + + def get_config(self): + return ColPaliConfig( + vlm_config=self.vlm_config, + embedding_dim=self.embedding_dim, + ) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [ + self.batch_size, + self.vision_config["num_channels"], + self.vision_config["image_size"], + self.vision_config["image_size"], + ] + ) + config = self.get_config() + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + input_ids = ids_tensor([self.batch_size, self.seq_length], config.vlm_config.text_config.vocab_size - 1) + 1 + attention_mask = input_ids.ne(1).to(torch_device) + # set the 16 first tokens to be image, and ensure that no other tokens are image tokens + # do not change this unless you modified image size or patch size + input_ids[input_ids == config.vlm_config.image_token_index] = self.pad_token_id + input_ids[:, :16] = config.vlm_config.image_token_index + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": input_ids, + "token_type_ids": torch.zeros_like(input_ids), + } + return config, inputs_dict + + +@require_torch +class ColPaliForRetrievalModelTest(ModelTesterMixin, unittest.TestCase): + """ + Model tester for `ColPaliForRetrieval`. 
+ """ + + all_model_classes = (ColPaliForRetrieval,) if is_torch_available() else () + fx_compatible = False + test_torchscript = False + test_pruning = False + test_resize_embeddings = True + test_head_masking = False + + def setUp(self): + self.model_tester = ColPaliForRetrievalModelTester(self) + self.config_tester = ConfigTester(self, config_class=ColPaliConfig, has_text_modality=False) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + # while some other models require pixel_values to be present + def test_inputs_embeds_matches_input_ids(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + inputs_embeds = model.get_input_embeddings()(input_ids) + + with torch.no_grad(): + out_ids = model(input_ids=input_ids, **inputs)[0] + out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] + torch.testing.assert_close(out_embeds, out_ids) + + @slow + @require_vision + def test_colpali_forward_inputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + with torch.no_grad(): + outputs = model(**inputs, return_dict=True) + + self.assertIsInstance(outputs, ColPaliForRetrievalOutput) + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip( + reason="From PaliGemma: Some undefined behavior encountered with test versions of this model. Skip for now." 
+ ) + def test_model_parallelism(self): + pass + + @unittest.skip( + reason="PaliGemmma's SigLip encoder uses the same initialization scheme as the Flax original implementation" + ) + def test_initialization(self): + pass + + # TODO extend valid outputs to include this test @Molbap + @unittest.skip(reason="PaliGemma has currently one output format.") + def test_model_outputs_equivalence(self): + pass + + @unittest.skip(reason="Pass because ColPali requires `attention_mask is not None`") + def test_sdpa_can_dispatch_on_flash(self): + pass + + @unittest.skip(reason="Pass because ColPali requires `attention_mask is not None`") + def test_sdpa_can_compile_dynamic(self): + pass + + +@require_torch +class ColPaliModelIntegrationTest(unittest.TestCase): + model_name: ClassVar[str] = "vidore/colpali-v1.2-hf" + + def setUp(self): + self.processor = ColPaliProcessor.from_pretrained(self.model_name) + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + + @slow + def test_model_integration_test(self): + """ + Test if the model is able to retrieve the correct pages for a small and easy dataset. + """ + model = ColPaliForRetrieval.from_pretrained( + self.model_name, + torch_dtype=torch.bfloat16, + device_map=torch_device, + ).eval() + + # Load the test dataset + ds = load_dataset("hf-internal-testing/document-visual-retrieval-test", split="test") + + # Preprocess the examples + batch_images = self.processor(images=ds["image"]).to(torch_device) + batch_queries = self.processor(text=ds["query"]).to(torch_device) + + # Run inference + with torch.inference_mode(): + image_embeddings = model(**batch_images).embeddings + query_embeddings = model(**batch_queries).embeddings + + # Compute retrieval scores + scores = self.processor.score_retrieval( + query_embeddings=query_embeddings, + passage_embeddings=image_embeddings, + ) # (len(qs), len(ps)) + + assert scores.ndim == 2, f"Expected 2D tensor, got {scores.ndim}" + assert scores.shape == (len(ds), len(ds)), f"Expected shape {(len(ds), len(ds))}, got {scores.shape}" + + # Check if the maximum scores per row are in the diagonal of the matrix score + self.assertTrue((scores.argmax(axis=1) == torch.arange(len(ds), device=scores.device)).all()) + + # Further validation: fine-grained check, with a hardcoded score from the original implementation + expected_scores = torch.tensor( + [ + [15.5625, 6.5938, 14.4375], + [12.2500, 16.2500, 11.0000], + [15.0625, 11.7500, 21.0000], + ], + dtype=scores.dtype, + ) + + assert torch.allclose(scores, expected_scores, atol=1), f"Expected scores {expected_scores}, got {scores}" diff --git a/docs/transformers/tests/models/colpali/test_processing_colpali.py b/docs/transformers/tests/models/colpali/test_processing_colpali.py new file mode 100644 index 0000000000000000000000000000000000000000..c2bbdaaa9621bdb7d75dfb17027e36aa76aee21e --- /dev/null +++ b/docs/transformers/tests/models/colpali/test_processing_colpali.py @@ -0,0 +1,248 @@ +import shutil +import tempfile +import unittest + +import torch + +from transformers import GemmaTokenizer +from transformers.models.colpali.processing_colpali import ColPaliProcessor +from transformers.testing_utils import get_tests_dir, require_torch, require_vision +from transformers.utils import is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import ( + ColPaliProcessor, + PaliGemmaProcessor, + SiglipImageProcessor, + ) + +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") + + 
+@require_vision +class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = ColPaliProcessor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384") + image_processor.image_seq_length = 0 + tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True) + processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer) + processor.save_pretrained(cls.tmpdirname) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname, ignore_errors=True) + + @require_torch + @require_vision + def test_process_images(self): + # Processor configuration + image_input = self.prepare_image_inputs() + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=112, padding="max_length") + image_processor.image_seq_length = 14 + + # Get the processor + processor = self.processor_class( + tokenizer=tokenizer, + image_processor=image_processor, + ) + + # Process the image + batch_feature = processor.process_images(images=image_input, return_tensors="pt") + + # Assertions + self.assertIn("pixel_values", batch_feature) + self.assertEqual(batch_feature["pixel_values"].shape, torch.Size([1, 3, 384, 384])) + + @require_torch + @require_vision + def test_process_queries(self): + # Inputs + queries = [ + "Is attention really all you need?", + "Are Benjamin, Antoine, Merve, and Jo best friends?", + ] + + # Processor configuration + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=112, padding="max_length") + image_processor.image_seq_length = 14 + + # Get the processor + processor = self.processor_class( + tokenizer=tokenizer, + image_processor=image_processor, + ) + + # Process the image + batch_feature = processor.process_queries(text=queries, return_tensors="pt") + + # Assertions + self.assertIn("input_ids", batch_feature) + self.assertIsInstance(batch_feature["input_ids"], torch.Tensor) + self.assertEqual(batch_feature["input_ids"].shape[0], len(queries)) + + # The following tests are overwritten as ColPaliProcessor can only take one of images or text as input at a time + + def test_tokenizer_defaults_preserved_by_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + input_str = self.prepare_text_inputs() + inputs = processor(text=input_str, return_tensors="pt") + self.assertEqual(inputs[self.text_input_name].shape[-1], 117) + + def test_image_processor_defaults_preserved_by_image_kwargs(self): + """ + We use do_rescale=True, rescale_factor=-1 to ensure that image_processor kwargs are preserved in the processor. + We then check that the mean of the pixel_values is less than or equal to 0 after processing. + Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied. 
+ """ + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", do_rescale=True, rescale_factor=-1 + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + image_input = self.prepare_image_inputs() + + inputs = processor(images=image_input, return_tensors="pt") + self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) + + def test_kwargs_overrides_default_tokenizer_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_components["tokenizer"] = self.get_component("tokenizer", padding="longest") + + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + input_str = self.prepare_text_inputs() + inputs = processor(text=input_str, return_tensors="pt", max_length=112, padding="max_length") + self.assertEqual(inputs[self.text_input_name].shape[-1], 112) + + def test_kwargs_overrides_default_image_processor_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", do_rescale=True, rescale_factor=1 + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + image_input = self.prepare_image_inputs() + + inputs = processor(images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt") + self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) + + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + inputs = processor( + text=input_str, + return_tensors="pt", + do_rescale=True, + rescale_factor=-1, + padding="max_length", + max_length=76, + ) + + self.assertEqual(inputs[self.text_input_name].shape[-1], 76) + + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + image_input = self.prepare_image_inputs(batch_size=2) + inputs = processor( + images=image_input, + return_tensors="pt", + do_rescale=True, + rescale_factor=-1, + padding="longest", + max_length=76, + ) + + self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) + + def test_doubly_passed_kwargs(self): + if "image_processor" not in 
self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + image_input = self.prepare_image_inputs() + with self.assertRaises(ValueError): + _ = processor( + images=image_input, + images_kwargs={"do_rescale": True, "rescale_factor": -1}, + do_rescale=True, + return_tensors="pt", + ) + + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + + self.assertEqual(inputs[self.text_input_name].shape[-1], 76) + + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(images=image_input, **all_kwargs) + self.assertEqual(inputs[self.text_input_name].shape[-1], 76) diff --git a/docs/transformers/tests/models/conditional_detr/__init__.py b/docs/transformers/tests/models/conditional_detr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/docs/transformers/tests/models/conditional_detr/test_image_processing_conditional_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..4b02a1257844904f08c41f09d82918a7577d5a49 --- /dev/null +++ b/docs/transformers/tests/models/conditional_detr/test_image_processing_conditional_detr.py @@ -0,0 +1,607 @@ +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import pathlib +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision, slow +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available + +from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import ConditionalDetrImageProcessor + + if is_torchvision_available(): + from transformers import ConditionalDetrImageProcessorFast + + +class ConditionalDetrImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + do_rescale=True, + rescale_factor=1 / 255, + do_pad=True, + ): + # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p + size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_pad = do_pad + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_rescale": self.do_rescale, + "rescale_factor": self.rescale_factor, + "do_pad": self.do_pad, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to ConditionalDetrImageProcessor, + assuming do_resize is set to True with a scalar size. 
+ """ + if not batched: + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + elif isinstance(image, np.ndarray): + h, w = image.shape[0], image.shape[1] + else: + h, w = image.shape[1], image.shape[2] + if w < h: + expected_height = int(self.size["shortest_edge"] * h / w) + expected_width = self.size["shortest_edge"] + elif w > h: + expected_height = self.size["shortest_edge"] + expected_width = int(self.size["shortest_edge"] * w / h) + else: + expected_height = self.size["shortest_edge"] + expected_width = self.size["shortest_edge"] + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + def expected_output_image_shape(self, images): + height, width = self.get_expected_values(images, batched=True) + return self.num_channels, height, width + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = ConditionalDetrImageProcessor if is_vision_available() else None + fast_image_processing_class = ConditionalDetrImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = ConditionalDetrImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) + self.assertEqual(image_processor.do_pad, True) + + image_processor = image_processing_class.from_dict( + self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False + ) + self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) + self.assertEqual(image_processor.do_pad, False) + + @slow + def test_call_pytorch_with_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + for image_processing_class in self.image_processor_list: + # encode them + image_processing = 
image_processing_class.from_pretrained("microsoft/conditional-detr-resnet-50") + encoding = image_processing(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + torch.testing.assert_close(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + torch.testing.assert_close(encoding["labels"][0]["size"], expected_size) + + @slow + def test_call_pytorch_with_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class(format="coco_panoptic") + encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + torch.testing.assert_close(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + 
torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify masks + expected_masks_sum = 822873 + relative_error = torch.abs(encoding["labels"][0]["masks"].sum() - expected_masks_sum) / expected_masks_sum + self.assertTrue(relative_error < 1e-3) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + torch.testing.assert_close(encoding["labels"][0]["size"], expected_size) + + @slow + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->ConditionalDetr, facebook/detr-resnet-50 ->microsoft/conditional-detr-resnet-50 + def test_batched_coco_detection_annotations(self): + image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800)) + + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f: + target = json.loads(f.read()) + + annotations_0 = {"image_id": 39769, "annotations": target} + annotations_1 = {"image_id": 39769, "annotations": target} + + # Adjust the bounding boxes for the resized image + w_0, h_0 = image_0.size + w_1, h_1 = image_1.size + for i in range(len(annotations_1["annotations"])): + coords = annotations_1["annotations"][i]["bbox"] + new_bbox = [ + coords[0] * w_1 / w_0, + coords[1] * h_1 / h_0, + coords[2] * w_1 / w_0, + coords[3] * h_1 / h_0, + ] + annotations_1["annotations"][i]["bbox"] = new_bbox + + images = [image_0, image_1] + annotations = [annotations_0, annotations_1] + + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class() + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + return_tensors="pt", # do_convert_annotations=True + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.6879, 0.4609, 0.0755, 0.3691], + [0.2118, 0.3359, 0.2601, 0.1566], + [0.5011, 0.5000, 0.9979, 1.0000], + [0.5010, 0.5020, 0.9979, 0.9959], + [0.3284, 0.5944, 0.5884, 0.8112], + [0.8394, 0.5445, 0.3213, 0.9110], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.4130, 0.2765, 0.0453, 0.2215], + [0.1272, 0.2016, 0.1561, 0.0940], + [0.3757, 0.4933, 0.7488, 0.9865], + [0.3759, 0.5002, 0.7492, 0.9955], + [0.1971, 0.5456, 0.3532, 0.8646], + [0.5790, 0.4115, 0.3430, 0.7161], + ] + ) + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1e-3, rtol=1e-3) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + 
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1, rtol=1) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1, rtol=1) + + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->ConditionalDetr + def test_batched_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800)) + + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt") as f: + target = json.loads(f.read()) + + annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + w_0, h_0 = image_0.size + w_1, h_1 = image_1.size + for i in range(len(annotation_1["segments_info"])): + coords = annotation_1["segments_info"][i]["bbox"] + new_bbox = [ + coords[0] * w_1 / w_0, + coords[1] * h_1 / h_0, + coords[2] * w_1 / w_0, + coords[3] * h_1 / h_0, + ] + annotation_1["segments_info"][i]["bbox"] = new_bbox + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + images = [image_0, image_1] + annotations = [annotation_0, annotation_1] + + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class(format="coco_panoptic") + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_tensors="pt", + return_segmentation_masks=True, + ) + + # Check the pixel values have been padded + 
postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.2625, 0.5437, 0.4688, 0.8625], + [0.7719, 0.4104, 0.4531, 0.7125], + [0.5000, 0.4927, 0.9969, 0.9854], + [0.1688, 0.2000, 0.2063, 0.0917], + [0.5492, 0.2760, 0.0578, 0.2187], + [0.4992, 0.4990, 0.9984, 0.9979], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.1576, 0.3262, 0.2814, 0.5175], + [0.4634, 0.2463, 0.2720, 0.4275], + [0.3002, 0.2956, 0.5985, 0.5913], + [0.1013, 0.1200, 0.1238, 0.0550], + [0.3297, 0.1656, 0.0347, 0.1312], + [0.2997, 0.2994, 0.5994, 0.5987], + ] + ) + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1e-3, rtol=1e-3) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1, rtol=1) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1, rtol=1) + + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->ConditionalDetr + def test_max_width_max_height_resizing_and_pad_strategy(self): + for image_processing_class in 
self.image_processor_list: + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = image_processing_class( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) + + def test_longest_edge_shortest_edge_resizing_strategy(self): + image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) + + # max size is set; width < height; + # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436 + image_processor = ConditionalDetrImageProcessor( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436])) + + image_2 = torch.ones([653, 958, 3], dtype=torch.uint8) + # max size is set; height < width; + # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640 + image_processor = ConditionalDetrImageProcessor( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640])) + + image_3 = torch.ones([100, 120, 3], dtype=torch.uint8) + # max size is set; width == size; height > max_size; + # do_pad=False, longest_edge=118, shortest_edge=100, image=120x100 -> 118x98 + image_processor = ConditionalDetrImageProcessor( + size={"longest_edge": 118, "shortest_edge": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_3], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118])) + + image_4 = torch.ones([128, 50, 3], dtype=torch.uint8) + # max size is set; height == size; width < max_size; + # do_pad=False, longest_edge=256, shortest_edge=50, image=50x128 -> 50x128 + image_processor = ConditionalDetrImageProcessor( + size={"longest_edge": 
256, "shortest_edge": 50}, + do_pad=False, + ) + inputs = image_processor(images=[image_4], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50])) + + image_5 = torch.ones([50, 50, 3], dtype=torch.uint8) + # max size is set; height == width; width < max_size; + # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50 + image_processor = ConditionalDetrImageProcessor( + size={"longest_edge": 117, "shortest_edge": 50}, + do_pad=False, + ) + inputs = image_processor(images=[image_5], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50])) diff --git a/docs/transformers/tests/models/conditional_detr/test_modeling_conditional_detr.py b/docs/transformers/tests/models/conditional_detr/test_modeling_conditional_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..c13079c5fd5b94dea546412e307addc53fd00c22 --- /dev/null +++ b/docs/transformers/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -0,0 +1,616 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Conditional DETR model.""" + +import inspect +import math +import unittest + +from transformers import ConditionalDetrConfig, ResNetConfig, is_torch_available, is_vision_available +from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, + ConditionalDetrModel, + ) + + +if is_vision_available(): + from PIL import Image + + from transformers import ConditionalDetrImageProcessor + + +class ConditionalDetrModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + min_size=200, + max_size=200, + n_targets=8, + num_labels=91, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.min_size = min_size + self.max_size = max_size + self.n_targets = n_targets + self.num_labels = num_labels 
+ + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = math.ceil(self.min_size / 32) * math.ceil(self.max_size / 32) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) + + pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, labels + + def get_config(self): + resnet_config = ResNetConfig( + num_channels=3, + embeddings_size=10, + hidden_sizes=[10, 20, 30, 40], + depths=[1, 1, 2, 1], + hidden_act="relu", + num_labels=3, + out_features=["stage2", "stage3", "stage4"], + out_indices=[2, 3, 4], + ) + return ConditionalDetrConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + use_timm_backbone=False, + backbone_config=resnet_config, + backbone=None, + use_pretrained_backbone=False, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def create_and_check_conditional_detr_model(self, config, pixel_values, pixel_mask, labels): + model = ConditionalDetrModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size) + ) + + def create_and_check_conditional_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = ConditionalDetrForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_torch +class ConditionalDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + ConditionalDetrModel, + 
ConditionalDetrForObjectDetection, + ConditionalDetrForSegmentation, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + {"image-feature-extraction": ConditionalDetrModel, "object-detection": ConditionalDetrForObjectDetection} + if is_torch_available() + else {} + ) + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + zero_init_hidden_state = True + test_torch_exportable = True + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ in ["ConditionalDetrForObjectDetection", "ConditionalDetrForSegmentation"]: + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.min_size, + self.model_tester.max_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = ConditionalDetrModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConditionalDetrConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_conditional_detr_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_conditional_detr_model(*config_and_inputs) + + def test_conditional_detr_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_conditional_detr_object_detection_head_model(*config_and_inputs) + + # TODO: check if this works again for PyTorch 2.x.y + @unittest.skip(reason="Got `CUDA error: misaligned address` with PyTorch 2.0.0.") + def test_multi_gpu_data_parallel_forward(self): + pass + + @unittest.skip(reason="Conditional DETR does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Conditional DETR does not use inputs_embeds") + def test_inputs_embeds_matches_input_ids(self): + pass + + @unittest.skip(reason="Conditional DETR does not have a get_input_embeddings method") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="Conditional DETR is not a generative model") + def test_generate_without_input_ids(self): + pass + + @unittest.skip(reason="Conditional DETR does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @slow + @unittest.skip(reason="TODO Niels: fix me!") + def test_model_outputs_equivalence(self): + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + decoder_seq_length = self.model_tester.decoder_seq_length + encoder_seq_length = self.model_tester.encoder_seq_length + decoder_key_length = self.model_tester.decoder_seq_length + encoder_key_length = self.model_tester.encoder_seq_length + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = 
model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 6 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "ConditionalDetrForObjectDetection": + correct_outlen += 1 + # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks + if model_class.__name__ == "ConditionalDetrForSegmentation": + correct_outlen += 2 + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_retain_grad_hidden_states_attentions(self): + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to 
test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_auxiliary_loss(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.auxiliary_loss = True + + # only test for object detection and segmentation model + for model_class in self.all_model_classes[1:]: + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + + outputs = model(**inputs) + + self.assertIsNotNone(outputs.auxiliary_outputs) + self.assertEqual(len(outputs.auxiliary_outputs), self.model_tester.num_hidden_layers - 1) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if model.config.is_encoder_decoder: + expected_arg_names = ["pixel_values", "pixel_mask"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["pixel_values", "pixel_mask"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_different_timm_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "ConditionalDetrForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + elif model_class.__name__ == "ConditionalDetrForSegmentation": + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propagated to backbone + 
self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) + + self.assertTrue(outputs) + + @require_timm + def test_hf_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Load a pretrained HF checkpoint as backbone + config.backbone = "microsoft/resnet-18" + config.backbone_config = None + config.use_timm_backbone = False + config.use_pretrained_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "ConditionalDetrForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + elif model_class.__name__ == "ConditionalDetrForSegmentation": + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + configs_no_init.init_xavier_std = 1e9 + + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if "bbox_attention" in name and "bias" not in name: + self.assertLess( + 100000, + abs(param.data.max().item()), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + +TOLERANCE = 1e-4 + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class ConditionalDetrModelIntegrationTests(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ( + ConditionalDetrImageProcessor.from_pretrained("microsoft/conditional-detr-resnet-50") + if is_vision_available() + else None + ) + + def test_inference_no_head(self): + model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(**encoding) + + expected_shape = torch.Size((1, 300, 256)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + expected_slice = torch.tensor( + [[0.4222, 0.7471, 0.8760], [0.6395, -0.2729, 0.7127], [-0.3090, 0.7642, 0.9529]] + ).to(torch_device) + torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + def test_inference_object_detection_head(self): + model = 
ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50").to( + torch_device + ) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + # verify logits + box predictions + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-10.4372, -5.7558, -8.6764], [-10.5410, -5.8704, -8.0590], [-10.6827, -6.3469, -8.3923]] + ).to(torch_device) + torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice_logits, rtol=1e-4, atol=1e-4) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.7733, 0.6576, 0.4496], [0.5171, 0.1184, 0.9094], [0.8846, 0.5647, 0.2486]] + ).to(torch_device) + torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, rtol=1e-4, atol=1e-4) + + # verify postprocessing + results = image_processor.post_process_object_detection( + outputs, threshold=0.3, target_sizes=[image.size[::-1]] + )[0] + expected_scores = torch.tensor([0.8330, 0.8313, 0.8039, 0.6829, 0.5355]).to(torch_device) + expected_labels = [75, 17, 17, 75, 63] + expected_slice_boxes = torch.tensor([38.3089, 72.1022, 177.6293, 118.4512]).to(torch_device) + + self.assertEqual(len(results["scores"]), 5) + torch.testing.assert_close(results["scores"], expected_scores, rtol=1e-4, atol=1e-4) + self.assertSequenceEqual(results["labels"].tolist(), expected_labels) + torch.testing.assert_close(results["boxes"][0, :], expected_slice_boxes) diff --git a/docs/transformers/tests/models/convbert/__init__.py b/docs/transformers/tests/models/convbert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/convbert/test_modeling_convbert.py b/docs/transformers/tests/models/convbert/test_modeling_convbert.py new file mode 100644 index 0000000000000000000000000000000000000000..8aba631c3070a4797e847c05f33667f04ae1f262 --- /dev/null +++ b/docs/transformers/tests/models/convbert/test_modeling_convbert.py @@ -0,0 +1,483 @@ +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
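+# Note added for this docs copy (not part of the upstream file): the module follows the
+# usual transformers test layout, with a ConvBertModelTester that builds a tiny config and
+# random inputs, a ConvBertModelTest that drives the shared ModelTesterMixin /
+# PipelineTesterMixin checks, and a slow ConvBertModelIntegrationTest that exercises the
+# pretrained "YituTech/conv-bert-base" checkpoint.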
+"""Testing suite for the PyTorch ConvBERT model.""" + +import os +import tempfile +import unittest + +from transformers import ConvBertConfig, is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, require_torch_accelerator, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_QUESTION_ANSWERING_MAPPING, + ConvBertForMaskedLM, + ConvBertForMultipleChoice, + ConvBertForQuestionAnswering, + ConvBertForSequenceClassification, + ConvBertForTokenClassification, + ConvBertModel, + ) + + +class ConvBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return ConvBertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + 
type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ConvBertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ConvBertForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ConvBertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = ConvBertForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = ConvBertForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = ConvBertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, 
self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class ConvBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + ConvBertModel, + ConvBertForMaskedLM, + ConvBertForMultipleChoice, + ConvBertForQuestionAnswering, + ConvBertForSequenceClassification, + ConvBertForTokenClassification, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": ConvBertModel, + "fill-mask": ConvBertForMaskedLM, + "question-answering": ConvBertForQuestionAnswering, + "text-classification": ConvBertForSequenceClassification, + "token-classification": ConvBertForTokenClassification, + "zero-shot": ConvBertForSequenceClassification, + } + if is_torch_available() + else {} + ) + test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = ConvBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConvBertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "YituTech/conv-bert-base" + model = ConvBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + decoder_key_length = 
getattr(self.model_tester, "decoder_key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + if chunk_length is not None: + self.assertListEqual( + list(attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads / 2, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads / 2, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 5 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Question Answering model returns start_logits and end_logits + if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): + correct_outlen += 1 # start_logits and end_logits instead of only 1 output + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + 
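+        # Note added for this docs copy: the attention-shape checks divide
+        # num_attention_heads by 2 because the tester leaves ConvBERT's head_ratio at its
+        # default of 2, so only half of the heads remain ordinary self-attention heads
+        # (the rest use span-based dynamic convolution); test_reducing_attention_heads
+        # further below raises head_ratio to 4.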
self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + if chunk_length is not None: + self.assertListEqual( + list(self_attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads / 2, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads / 2, encoder_seq_length, encoder_key_length], + ) + + @slow + @require_torch_accelerator + def test_torchscript_device_change(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + # ConvBertForMultipleChoice behaves incorrectly in JIT environments. + if model_class == ConvBertForMultipleChoice: + self.skipTest(reason="ConvBertForMultipleChoice behaves incorrectly in JIT environments.") + + config.torchscript = True + model = model_class(config=config) + + inputs_dict = self._prepare_for_class(inputs_dict, model_class) + traced_model = torch.jit.trace( + model, (inputs_dict["input_ids"].to("cpu"), inputs_dict["attention_mask"].to("cpu")) + ) + + with tempfile.TemporaryDirectory() as tmp: + torch.jit.save(traced_model, os.path.join(tmp, "traced_model.pt")) + loaded = torch.jit.load(os.path.join(tmp, "traced_model.pt"), map_location=torch_device) + loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device)) + + def test_model_for_input_embeds(self): + batch_size = 2 + seq_length = 10 + inputs_embeds = torch.rand([batch_size, seq_length, 768], device=torch_device) + config = self.model_tester.get_config() + model = ConvBertModel(config=config) + model.to(torch_device) + model.eval() + result = model(inputs_embeds=inputs_embeds) + self.assertEqual(result.last_hidden_state.shape, (batch_size, seq_length, config.hidden_size)) + + def test_reducing_attention_heads(self): + config, *inputs_dict = self.model_tester.prepare_config_and_inputs() + config.head_ratio = 4 + self.model_tester.create_and_check_for_masked_lm(config, *inputs_dict) + + +@require_torch +class ConvBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head(self): + model = ConvBertModel.from_pretrained("YituTech/conv-bert-base") + input_ids = torch.tensor([[1, 2, 3, 4, 5, 6]]) + with torch.no_grad(): + output = model(input_ids)[0] + + expected_shape = torch.Size((1, 6, 768)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor( + [[[-0.0864, -0.4898, -0.3677], [0.1434, -0.2952, -0.7640], [-0.0112, -0.4432, -0.5432]]] + ) + + torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/convbert/test_modeling_tf_convbert.py b/docs/transformers/tests/models/convbert/test_modeling_tf_convbert.py new file mode 100644 index 0000000000000000000000000000000000000000..7bd21778eb8b5d945bda718e18db828a47105069 --- /dev/null +++ b/docs/transformers/tests/models/convbert/test_modeling_tf_convbert.py @@ -0,0 +1,424 @@ +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import os +import tempfile +import unittest + +from transformers import ConvBertConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TFConvBertForMaskedLM, + TFConvBertForMultipleChoice, + TFConvBertForQuestionAnswering, + TFConvBertForSequenceClassification, + TFConvBertForTokenClassification, + TFConvBertModel, + ) + from transformers.modeling_tf_utils import keras + + +class TFConvBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 384 + self.num_hidden_layers = 2 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.embedding_size = 128 + self.head_ratio = 2 + self.conv_kernel_size = 9 + self.num_groups = 1 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = ConvBertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + 
type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + return_dict=True, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFConvBertModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFConvBertForMaskedLM(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFConvBertForSequenceClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFConvBertForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFConvBertForTokenClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFConvBertForQuestionAnswering(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, 
+ token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFConvBertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + TFConvBertModel, + TFConvBertForMaskedLM, + TFConvBertForQuestionAnswering, + TFConvBertForSequenceClassification, + TFConvBertForTokenClassification, + TFConvBertForMultipleChoice, + ) + if is_tf_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": TFConvBertModel, + "fill-mask": TFConvBertForMaskedLM, + "question-answering": TFConvBertForQuestionAnswering, + "text-classification": TFConvBertForSequenceClassification, + "token-classification": TFConvBertForTokenClassification, + "zero-shot": TFConvBertForSequenceClassification, + } + if is_tf_available() + else {} + ) + test_pruning = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFConvBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=ConvBertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_saved_model_creation_extended(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + if hasattr(config, "use_cache"): + config.use_cache = True + + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + + for model_class in self.all_model_classes: + class_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + num_out = len(model(class_inputs_dict)) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, saved_model=True) + saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") + model = keras.models.load_model(saved_model_dir) + outputs = model(class_inputs_dict) + + if self.is_encoder_decoder: + output_hidden_states = outputs["encoder_hidden_states"] + output_attentions = outputs["encoder_attentions"] + else: + output_hidden_states = outputs["hidden_states"] + output_attentions = outputs["attentions"] + + 
self.assertEqual(len(outputs), num_out) + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + self.assertEqual(len(output_hidden_states), expected_num_layers) + self.assertListEqual( + list(output_hidden_states[0].shape[-2:]), + [self.model_tester.seq_length, self.model_tester.hidden_size], + ) + + self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(output_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads / 2, encoder_seq_length, encoder_key_length], + ) + + @slow + def test_model_from_pretrained(self): + model = TFConvBertModel.from_pretrained("YituTech/conv-bert-base") + self.assertIsNotNone(model) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", self.model_tester.seq_length) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", self.model_tester.seq_length) + decoder_key_length = getattr(self.model_tester, "key_length", decoder_seq_length) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + + def check_decoder_attentions_output(outputs): + out_len = len(outputs) + self.assertEqual(out_len % 2, 0) + decoder_attentions = outputs.decoder_attentions + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads / 2, decoder_seq_length, decoder_key_length], + ) + + def check_encoder_attentions_output(outputs): + attentions = [ + t.numpy() for t in (outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions) + ] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads / 2, encoder_seq_length, encoder_key_length], + ) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + config.output_hidden_states = False + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + out_len = len(outputs) + self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + if self.is_encoder_decoder: + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(config.output_hidden_states, False) + check_decoder_attentions_output(outputs) + + # Check that output attentions can also be changed via the config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + config.output_hidden_states = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) + self.assertEqual(model.config.output_hidden_states, True) + check_encoder_attentions_output(outputs) + + +@require_tf +class TFConvBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = 
TFConvBertModel.from_pretrained("YituTech/conv-bert-base") + input_ids = tf.constant([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + expected_shape = [1, 6, 768] + self.assertEqual(output.shape, expected_shape) + + expected_slice = tf.constant( + [ + [ + [-0.03475493, -0.4686034, -0.30638832], + [0.22637248, -0.26988646, -0.7423424], + [0.10324868, -0.45013508, -0.58280784], + ] + ] + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4) diff --git a/docs/transformers/tests/models/convnext/__init__.py b/docs/transformers/tests/models/convnext/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/convnext/test_image_processing_convnext.py b/docs/transformers/tests/models/convnext/test_image_processing_convnext.py new file mode 100644 index 0000000000000000000000000000000000000000..373eb16fce28d94fa3f71d9aa5b04539803545dd --- /dev/null +++ b/docs/transformers/tests/models/convnext/test_image_processing_convnext.py @@ -0,0 +1,122 @@ +# Copyright 2022s HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from transformers import ConvNextImageProcessor + + if is_torchvision_available(): + from transformers import ConvNextImageProcessorFast + + +class ConvNextImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + crop_pct=0.875, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + size = size if size is not None else {"shortest_edge": 20} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.crop_pct = crop_pct + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + "crop_pct": self.crop_pct, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.size["shortest_edge"], self.size["shortest_edge"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + 
torchify=torchify, + ) + + +@require_torch +@require_vision +class ConvNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = ConvNextImageProcessor if is_vision_available() else None + fast_image_processing_class = ConvNextImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = ConvNextImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "crop_pct")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 20}) + + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"shortest_edge": 42}) + + @unittest.skip( + "Skipping as ConvNextImageProcessor uses center_crop and center_crop functions are not equivalent for fast and slow processors" + ) + def test_slow_fast_equivalence_batched(self): + pass diff --git a/docs/transformers/tests/models/convnext/test_modeling_convnext.py b/docs/transformers/tests/models/convnext/test_modeling_convnext.py new file mode 100644 index 0000000000000000000000000000000000000000..fce8f4a35b41eef31fff2b7eb7674c0d8038c13e --- /dev/null +++ b/docs/transformers/tests/models/convnext/test_modeling_convnext.py @@ -0,0 +1,302 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
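# Editor's illustrative sketch (not part of the vendored test file): the
# from_dict checks above boil down to keyword arguments overriding the values
# stored in the serialized image-processor dict. Assuming a local transformers
# install with the vision extras, the behaviour can be reproduced directly:
from transformers import ConvNextImageProcessor

cfg = {"do_resize": True, "size": {"shortest_edge": 20}, "crop_pct": 0.875, "do_normalize": True}
processor = ConvNextImageProcessor.from_dict(cfg)
assert processor.size == {"shortest_edge": 20}
# A kwarg passed to from_dict wins over the dict entry and is re-normalized into a
# size dict, which is exactly what test_image_processor_from_dict_with_kwargs expects.
processor = ConvNextImageProcessor.from_dict(cfg, size=42)
assert processor.size == {"shortest_edge": 42}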
+"""Testing suite for the PyTorch ConvNext model.""" + +import unittest + +from transformers import ConvNextConfig +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_backbone_common import BackboneTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ConvNextBackbone, ConvNextForImageClassification, ConvNextModel + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class ConvNextModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=32, + num_channels=3, + num_stages=4, + hidden_sizes=[10, 20, 30, 40], + depths=[2, 2, 3, 2], + is_training=True, + use_labels=True, + intermediate_size=37, + hidden_act="gelu", + num_labels=10, + initializer_range=0.02, + out_features=["stage2", "stage3", "stage4"], + out_indices=[2, 3, 4], + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.num_stages = num_stages + self.hidden_sizes = hidden_sizes + self.depths = depths + self.is_training = is_training + self.use_labels = use_labels + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_labels = num_labels + self.initializer_range = initializer_range + self.out_features = out_features + self.out_indices = out_indices + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.num_labels) + + config = self.get_config() + return config, pixel_values, labels + + def get_config(self): + return ConvNextConfig( + num_channels=self.num_channels, + hidden_sizes=self.hidden_sizes, + depths=self.depths, + num_stages=self.num_stages, + hidden_act=self.hidden_act, + is_decoder=False, + initializer_range=self.initializer_range, + out_features=self.out_features, + out_indices=self.out_indices, + num_labels=self.num_labels, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = ConvNextModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + # expected last hidden states: B, C, H // 32, W // 32 + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + model = ConvNextForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_backbone(self, config, pixel_values, labels): + model = ConvNextBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify hidden states + self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[1], 4, 4]) + + # verify channels + self.parent.assertEqual(len(model.channels), 
len(config.out_features)) + self.parent.assertListEqual(model.channels, config.hidden_sizes[1:]) + + # verify backbone works with out_features=None + config.out_features = None + model = ConvNextBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), 1) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[-1], 1, 1]) + + # verify channels + self.parent.assertEqual(len(model.channels), 1) + self.parent.assertListEqual(model.channels, [config.hidden_sizes[-1]]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class ConvNextModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as ConvNext does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = ( + ( + ConvNextModel, + ConvNextForImageClassification, + ConvNextBackbone, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + {"image-feature-extraction": ConvNextModel, "image-classification": ConvNextForImageClassification} + if is_torch_available() + else {} + ) + + fx_compatible = True + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = ConvNextModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=ConvNextConfig, + has_text_modality=False, + hidden_size=37, + common_properties=["num_channels", "hidden_sizes"], + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="ConvNext does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="ConvNext does not support input and output embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="ConvNext does not use feedforward chunking") + def test_feed_forward_chunking(self): + pass + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_backbone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_backbone(*config_and_inputs) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_stages = self.model_tester.num_stages + self.assertEqual(len(hidden_states), expected_num_stages + 1) + + # ConvNext's feature maps are of shape (batch_size, num_channels, height, width) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.image_size // 4, self.model_tester.image_size // 4], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, 
config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/convnext-tiny-224" + model = ConvNextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class ConvNextModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return AutoImageProcessor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = ConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-0.0260, -0.4739, 0.1911]).to(torch_device) + + torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4) + + +@require_torch +class ConvNextBackboneTest(unittest.TestCase, BackboneTesterMixin): + all_model_classes = (ConvNextBackbone,) if is_torch_available() else () + config_class = ConvNextConfig + + has_attentions = False + + def setUp(self): + self.model_tester = ConvNextModelTester(self) diff --git a/docs/transformers/tests/models/convnext/test_modeling_tf_convnext.py b/docs/transformers/tests/models/convnext/test_modeling_tf_convnext.py new file mode 100644 index 0000000000000000000000000000000000000000..1e46e57fb25dff696e18839670caf95357d80257 --- /dev/null +++ b/docs/transformers/tests/models/convnext/test_modeling_tf_convnext.py @@ -0,0 +1,300 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
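# Editor's illustrative sketch (not part of the vendored test file): the slow
# integration test above performs a standard image-classification forward pass.
# Assuming network access to the facebook/convnext-tiny-224 checkpoint and the
# COCO fixture image, the same flow looks like this outside the test harness:
from PIL import Image

import torch
from transformers import AutoImageProcessor, ConvNextForImageClassification

processor = AutoImageProcessor.from_pretrained("facebook/convnext-tiny-224")
model = ConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224").eval()
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # shape (1, 1000): one score per ImageNet-1k class
print(logits.argmax(-1).item())  # index of the predicted class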
+"""Testing suite for the TensorFlow ConvNext model.""" + +from __future__ import annotations + +import inspect +import unittest + +from transformers import ConvNextConfig +from transformers.testing_utils import require_tf, require_vision, slow +from transformers.utils import cached_property, is_tf_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFConvNextForImageClassification, TFConvNextModel + + +if is_vision_available(): + from PIL import Image + + from transformers import ConvNextImageProcessor + + +class TFConvNextModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=32, + num_channels=3, + num_stages=4, + hidden_sizes=[10, 20, 30, 40], + depths=[2, 2, 3, 2], + is_training=True, + use_labels=True, + intermediate_size=37, + hidden_act="gelu", + type_sequence_label_size=10, + initializer_range=0.02, + num_labels=3, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.num_stages = num_stages + self.hidden_sizes = hidden_sizes + self.depths = depths + self.is_training = is_training + self.use_labels = use_labels + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return ConvNextConfig( + num_channels=self.num_channels, + hidden_sizes=self.hidden_sizes, + depths=self.depths, + num_stages=self.num_stages, + hidden_act=self.hidden_act, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = TFConvNextModel(config=config) + result = model(pixel_values, training=False) + # expected last hidden states: B, C, H // 32, W // 32 + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = TFConvNextForImageClassification(config) + result = model(pixel_values, labels=labels, training=False) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_tf +class TFConvNextModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as ConvNext does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = (TFConvNextModel, TFConvNextForImageClassification) if is_tf_available() else () + pipeline_model_mapping = ( + {"feature-extraction": TFConvNextModel, "image-classification": TFConvNextForImageClassification} + if is_tf_available() + else {} + ) + + test_pruning = False + test_onnx = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + + def setUp(self): + self.model_tester = TFConvNextModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=ConvNextConfig, + has_text_modality=False, + hidden_size=37, + ) + + @unittest.skip(reason="ConvNext does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skipIf( + not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0, + reason="TF does not support backprop for grouped convolutions on CPU.", + ) + @slow + def test_keras_fit(self): + super().test_keras_fit() + + @unittest.skip(reason="ConvNext does not support input and output embeddings") + def test_model_common_attributes(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skipIf( + not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0, + reason="TF does not support backprop for grouped convolutions on CPU.", + ) + def test_dataset_conversion(self): + super().test_dataset_conversion() + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_stages = self.model_tester.num_stages + self.assertEqual(len(hidden_states), expected_num_stages + 1) + + # ConvNext's feature maps are of shape (batch_size, num_channels, height, width) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.image_size // 4, self.model_tester.image_size // 4], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + # Since ConvNext does not have any attention we need to rewrite this test. 
+ def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (list, tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + all(tf.equal(tuple_object, dict_object)), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}" + ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model = TFConvNextModel.from_pretrained("facebook/convnext-tiny-224") + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_tf +@require_vision +class TFConvNextModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ConvNextImageProcessor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = TFConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224") + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="tf") + + # forward pass + outputs = model(**inputs) + + # verify the logits + expected_shape = tf.TensorShape((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = tf.constant([-0.0260, -0.4739, 0.1911]) + + tf.debugging.assert_near(outputs.logits[0, :3], expected_slice, atol=1e-4) diff --git a/docs/transformers/tests/models/convnextv2/__init__.py b/docs/transformers/tests/models/convnextv2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/docs/transformers/tests/models/convnextv2/test_modeling_convnextv2.py b/docs/transformers/tests/models/convnextv2/test_modeling_convnextv2.py new file mode 100644 index 0000000000000000000000000000000000000000..f24b33c439fba94c72c91a355b1575fb8503424f --- /dev/null +++ b/docs/transformers/tests/models/convnextv2/test_modeling_convnextv2.py @@ -0,0 +1,310 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch ConvNextV2 model.""" + +import unittest + +from transformers import ConvNextV2Config +from transformers.models.auto import get_values +from transformers.models.auto.modeling_auto import MODEL_FOR_BACKBONE_MAPPING_NAMES, MODEL_MAPPING_NAMES +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ConvNextV2Backbone, ConvNextV2ForImageClassification, ConvNextV2Model + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class ConvNextV2ModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=32, + num_channels=3, + num_stages=4, + hidden_sizes=[10, 20, 30, 40], + depths=[2, 2, 3, 2], + is_training=True, + use_labels=True, + intermediate_size=37, + hidden_act="gelu", + num_labels=10, + initializer_range=0.02, + out_features=["stage2", "stage3", "stage4"], + out_indices=[2, 3, 4], + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.num_stages = num_stages + self.hidden_sizes = hidden_sizes + self.depths = depths + self.is_training = is_training + self.use_labels = use_labels + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_labels = num_labels + self.initializer_range = initializer_range + self.out_features = out_features + self.out_indices = out_indices + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return ConvNextV2Config( + num_channels=self.num_channels, + hidden_sizes=self.hidden_sizes, + depths=self.depths, + num_stages=self.num_stages, + hidden_act=self.hidden_act, + is_decoder=False, + initializer_range=self.initializer_range, + out_features=self.out_features, + out_indices=self.out_indices, + num_labels=self.num_labels, + ) + + def create_and_check_model(self, config, pixel_values, labels): + 
model = ConvNextV2Model(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + # expected last hidden states: B, C, H // 32, W // 32 + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + model = ConvNextV2ForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + def prepare_config_and_inputs_with_labels(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values, "labels": labels} + return config, inputs_dict + + +@require_torch +class ConvNextV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as ConvNextV2 does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = ( + ( + ConvNextV2Model, + ConvNextV2ForImageClassification, + ConvNextV2Backbone, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + {"image-feature-extraction": ConvNextV2Model, "image-classification": ConvNextV2ForImageClassification} + if is_torch_available() + else {} + ) + + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = ConvNextV2ModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=ConvNextV2Config, + has_text_modality=False, + hidden_size=37, + common_properties=["hidden_sizes", "num_channels"], + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="ConvNextV2 does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="ConvNextV2 does not support input and output embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="ConvNextV2 does not use feedforward chunking") + def test_feed_forward_chunking(self): + pass + + def test_training(self): + if not self.model_tester.is_training: + self.skipTest(reason="ModelTester is not set to test training") + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_with_labels() + config.return_dict = True + + if model_class.__name__ in [ + *get_values(MODEL_MAPPING_NAMES), + *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES), + ]: + continue + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + if not self.model_tester.is_training: + self.skipTest(reason="ModelTester is not set to test training") + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_with_labels() + config.use_cache = False + config.return_dict = True + + if ( + 
model_class.__name__ + in [*get_values(MODEL_MAPPING_NAMES), *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)] + or not model_class.supports_gradient_checkpointing + ): + continue + + model = model_class(config) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_stages = self.model_tester.num_stages + self.assertEqual(len(hidden_states), expected_num_stages + 1) + + # ConvNextV2's feature maps are of shape (batch_size, num_channels, height, width) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.image_size // 4, self.model_tester.image_size // 4], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/convnextv2-tiny-1k-224" + model = ConvNextV2Model.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class ConvNextV2ModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = ConvNextV2ForImageClassification.from_pretrained("facebook/convnextv2-tiny-1k-224").to(torch_device) + + preprocessor = self.default_image_processor + image = prepare_img() + inputs = preprocessor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([0.9996, 0.1966, -0.4386]).to(torch_device) + torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/convnextv2/test_modeling_tf_convnextv2.py b/docs/transformers/tests/models/convnextv2/test_modeling_tf_convnextv2.py new file mode 100644 index 0000000000000000000000000000000000000000..08e458609c7612dcf9ca895216a202d12411e455 --- /dev/null +++ 
b/docs/transformers/tests/models/convnextv2/test_modeling_tf_convnextv2.py @@ -0,0 +1,306 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the TensorFlow ConvNext model.""" + +from __future__ import annotations + +import inspect +import unittest + +import numpy as np + +from transformers import ConvNextV2Config +from transformers.testing_utils import require_tf, require_vision, slow +from transformers.utils import cached_property, is_tf_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFConvNextV2ForImageClassification, TFConvNextV2Model + + +if is_vision_available(): + from PIL import Image + + from transformers import ConvNextImageProcessor + + +class TFConvNextV2ModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=32, + num_channels=3, + num_stages=4, + hidden_sizes=[10, 20, 30, 40], + depths=[2, 2, 3, 2], + is_training=True, + use_labels=True, + intermediate_size=37, + hidden_act="gelu", + type_sequence_label_size=10, + initializer_range=0.02, + num_labels=3, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.num_stages = num_stages + self.hidden_sizes = hidden_sizes + self.depths = depths + self.is_training = is_training + self.use_labels = use_labels + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return ConvNextV2Config( + num_channels=self.num_channels, + hidden_sizes=self.hidden_sizes, + depths=self.depths, + num_stages=self.num_stages, + hidden_act=self.hidden_act, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = TFConvNextV2Model(config=config) + result = model(pixel_values, training=False) + # expected last hidden states: batch_size, channels, height // 32, width // 32 + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = TFConvNextV2ForImageClassification(config) + result = 
model(pixel_values, labels=labels, training=False) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_tf +class TFConvNextV2ModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as ConvNext does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (TFConvNextV2Model, TFConvNextV2ForImageClassification) if is_tf_available() else () + pipeline_model_mapping = ( + {"feature-extraction": TFConvNextV2Model, "image-classification": TFConvNextV2ForImageClassification} + if is_tf_available() + else {} + ) + + test_pruning = False + test_onnx = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + + def setUp(self): + self.model_tester = TFConvNextV2ModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=ConvNextV2Config, + has_text_modality=False, + hidden_size=37, + ) + + @unittest.skip(reason="ConvNext does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skipIf( + not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0, + reason="TF does not support backprop for grouped convolutions on CPU.", + ) + @slow + def test_keras_fit(self): + super().test_keras_fit() + + @unittest.skip(reason="ConvNext does not support input and output embeddings") + def test_model_common_attributes(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skipIf( + not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0, + reason="TF does not support backprop for grouped convolutions on CPU.", + ) + def test_dataset_conversion(self): + super().test_dataset_conversion() + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_stages = self.model_tester.num_stages + self.assertEqual(len(hidden_states), expected_num_stages + 1) + + # ConvNext's feature maps are of shape (batch_size, num_channels, height, width) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.image_size // 4, self.model_tester.image_size // 4], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work 
using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + # Since ConvNext does not have any attention we need to rewrite this test. + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (list, tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + all(tf.equal(tuple_object, dict_object)), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}" + ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model = TFConvNextV2Model.from_pretrained("facebook/convnextv2-tiny-1k-224") + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_tf +@require_vision +class TFConvNextV2ModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ( + ConvNextImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224") + if is_vision_available() + else None + ) + + @slow + def test_inference_image_classification_head(self): + model = TFConvNextV2ForImageClassification.from_pretrained("facebook/convnextv2-tiny-1k-224") + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="tf") + + # forward pass + outputs = model(**inputs) + + # verify the logits + expected_shape = tf.TensorShape((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = np.array([0.9996, 0.1966, -0.4386]) + + self.assertTrue(np.allclose(outputs.logits[0, :3].numpy(), expected_slice, atol=1e-4)) diff --git 
a/docs/transformers/tests/models/cpm/__init__.py b/docs/transformers/tests/models/cpm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/cpm/test_tokenization_cpm.py b/docs/transformers/tests/models/cpm/test_tokenization_cpm.py new file mode 100644 index 0000000000000000000000000000000000000000..2719e01b32e8ac92709dbd0bc3919eabcec8bb49 --- /dev/null +++ b/docs/transformers/tests/models/cpm/test_tokenization_cpm.py @@ -0,0 +1,50 @@ +# Copyright 2018 HuggingFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers.models.cpm.tokenization_cpm import CpmTokenizer +from transformers.testing_utils import custom_tokenizers + + +@custom_tokenizers +class CpmTokenizationTest(unittest.TestCase): + # There is no `CpmModel` + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + return True + + def test_pre_tokenization(self): + tokenizer = CpmTokenizer.from_pretrained("TsinghuaAI/CPM-Generate") + text = "Hugging Face大法好,谁用谁知道。" + normalized_text = "Hugging Face大法好,谁用谁知道。" + bpe_tokens = "▁Hu gg ing ▁ ▂ ▁F ace ▁大法 ▁好 ▁ , ▁谁 ▁用 ▁谁 ▁知 道 ▁ 。".split() + + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + + input_bpe_tokens = [13789, 13283, 1421, 8, 10, 1164, 13608, 16528, 63, 8, 9, 440, 108, 440, 121, 90, 8, 12, 0] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + reconstructed_text = tokenizer.decode(input_bpe_tokens) + self.assertEqual(reconstructed_text, normalized_text) diff --git a/docs/transformers/tests/models/cpmant/__init__.py b/docs/transformers/tests/models/cpmant/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/cpmant/test_modeling_cpmant.py b/docs/transformers/tests/models/cpmant/test_modeling_cpmant.py new file mode 100644 index 0000000000000000000000000000000000000000..bf3a655c8ddee193acfa0b6a778895fdbcbe0528 --- /dev/null +++ b/docs/transformers/tests/models/cpmant/test_modeling_cpmant.py @@ -0,0 +1,233 @@ +# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
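The CPM tokenization test above checks a tokenize/encode/decode round trip against the public TsinghuaAI/CPM-Generate checkpoint. A minimal standalone sketch of the same round trip, assuming that checkpoint is reachable and the optional jieba dependency is installed, might look like this (it is an illustration, not part of the test suite):

from transformers.models.cpm.tokenization_cpm import CpmTokenizer

# Slow tokenizer: jieba pre-segmentation followed by SentencePiece BPE.
tokenizer = CpmTokenizer.from_pretrained("TsinghuaAI/CPM-Generate")

text = "Hugging Face大法好,谁用谁知道。"
tokens = tokenizer.tokenize(text)              # pieces such as "▁Hu", "gg", "ing", ...
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(tokenizer.decode(ids))                   # decodes back to the normalized input text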
+"""Testing suite for the PyTorch CPMAnt model.""" + +import unittest + +from transformers.testing_utils import is_torch_available, require_torch, tooslow + +from ...generation.test_utils import torch_device +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + CpmAntConfig, + CpmAntForCausalLM, + CpmAntModel, + CpmAntTokenizer, + ) + + +@require_torch +class CpmAntModelTester: + def __init__( + self, + parent, + batch_size=2, + seq_length=8, + is_training=True, + use_token_type_ids=False, + use_input_mask=False, + use_labels=False, + use_mc_token_ids=False, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + num_buckets=32, + max_distance=128, + prompt_length=8, + prompt_types=8, + segment_types=8, + init_std=0.02, + return_dict=True, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.num_buckets = num_buckets + self.max_distance = max_distance + self.prompt_length = prompt_length + self.prompt_types = prompt_types + self.segment_types = segment_types + self.init_std = init_std + self.return_dict = return_dict + + def prepare_config_and_inputs(self): + input_ids = {} + input_ids["input_ids"] = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).type(torch.int32) + input_ids["use_cache"] = False + + config = self.get_config() + + return (config, input_ids) + + def get_config(self): + return CpmAntConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + dim_ff=self.intermediate_size, + position_bias_num_buckets=self.num_buckets, + position_bias_max_distance=self.max_distance, + prompt_types=self.prompt_types, + prompt_length=self.prompt_length, + segment_types=self.segment_types, + use_cache=True, + init_std=self.init_std, + return_dict=self.return_dict, + ) + + def create_and_check_cpmant_model(self, config, input_ids, *args): + model = CpmAntModel(config=config) + model.to(torch_device) + model.eval() + + hidden_states = model(**input_ids).last_hidden_state + + self.parent.assertEqual(hidden_states.shape, (self.batch_size, self.seq_length, config.hidden_size)) + + def create_and_check_lm_head_model(self, config, input_ids, *args): + model = CpmAntForCausalLM(config) + model.to(torch_device) + input_ids["input_ids"] = input_ids["input_ids"].to(torch_device) + model.eval() + + model_output = model(**input_ids) + self.parent.assertEqual( + model_output.logits.shape, + (self.batch_size, self.seq_length, config.vocab_size + config.prompt_types * config.prompt_length), + ) + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + +@require_torch +class CpmAntModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (CpmAntModel, CpmAntForCausalLM) if is_torch_available() else 
() + # Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante + all_generative_model_classes = () + pipeline_model_mapping = ( + {"feature-extraction": CpmAntModel, "text-generation": CpmAntForCausalLM} if is_torch_available() else {} + ) + + test_pruning = False + test_missing_keys = False + test_mismatched_shapes = False + test_head_masking = False + test_resize_embeddings = False + + def setUp(self): + self.model_tester = CpmAntModelTester(self) + self.config_tester = ConfigTester(self, config_class=CpmAntConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_inputs_embeds(self): + unittest.skip(reason="CPMAnt doesn't support inputs_embeds.")(self.test_inputs_embeds) + + def test_retain_grad_hidden_states_attentions(self): + unittest.skip( + "CPMAnt doesn't support retaining gradients on hidden_states or attentions, because prompt management detaches output.hidden_states (and likewise attentions) from the graph. We strongly recommend tuning the model through the loss instead." + )(self.test_retain_grad_hidden_states_attentions) + + def test_cpmant_model(self): + config, inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_cpmant_model(config, inputs) + + def test_cpmant_lm_head_model(self): + config, inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(config, inputs) + + +@require_torch + class CpmAntModelIntegrationTest(unittest.TestCase): + @tooslow + def test_inference_masked_lm(self): + texts = "今天天气真好!" + model_path = "openbmb/cpm-ant-10b" + model = CpmAntModel.from_pretrained(model_path) + tokenizer = CpmAntTokenizer.from_pretrained(model_path) + inputs = tokenizer(texts, return_tensors="pt") + hidden_states = model(**inputs).last_hidden_state + + expected_slice = torch.tensor( + [[[6.1708, 5.9244, 1.0835], [6.5207, 6.2893, -11.3324], [-1.0107, -0.0576, -5.9577]]], + ) + torch.testing.assert_close(hidden_states[:, :3, :3], expected_slice, rtol=1e-2, atol=1e-2) + + +@require_torch + class CpmAntForCausalLMIntegrationTest(unittest.TestCase): + @tooslow + def test_inference_causal(self): + texts = "今天天气真好!"
+ model_path = "openbmb/cpm-ant-10b" + model = CpmAntForCausalLM.from_pretrained(model_path) + tokenizer = CpmAntTokenizer.from_pretrained(model_path) + inputs = tokenizer(texts, return_tensors="pt") + hidden_states = model(**inputs).logits + + expected_slice = torch.tensor( + [[[-6.4267, -6.4083, -6.3958], [-5.8802, -5.9447, -5.7811], [-5.3896, -5.4820, -5.4295]]], + ) + torch.testing.assert_close(hidden_states[:, :3, :3], expected_slice, rtol=1e-2, atol=1e-2) + + @tooslow + def test_simple_generation(self): + model_path = "openbmb/cpm-ant-10b" + model = CpmAntForCausalLM.from_pretrained(model_path) + tokenizer = CpmAntTokenizer.from_pretrained(model_path) + texts = "今天天气不错," + expected_output = "今天天气不错,阳光明媚,我和妈妈一起去超市买东西。\n在超市里,我看到了一个很好玩的玩具,它的名字叫“机器人”。它有一个圆圆的脑袋,两只圆圆的眼睛,还有一个圆圆的" + model_inputs = tokenizer(texts, return_tensors="pt") + token_ids = model.generate(**model_inputs) + output_texts = tokenizer.batch_decode(token_ids) + self.assertEqual(expected_output, output_texts) + + @tooslow + def test_batch_generation(self): + model_path = "openbmb/cpm-ant-10b" + model = CpmAntForCausalLM.from_pretrained(model_path) + tokenizer = CpmAntTokenizer.from_pretrained(model_path) + texts = ["今天天气不错,", "新年快乐,万事如意!"] + expected_output = [ + "今天天气不错,阳光明媚,我和妈妈一起去超市买东西。\n在超市里,我看到了一个很好玩的玩具,它的名字叫“机器人”。它有一个圆圆的脑袋,两只圆圆的眼睛,还有一个圆圆的", + "新年快乐,万事如意!在这辞旧迎新的美好时刻,我谨代表《农村新技术》杂志社全体同仁,向一直以来关心、支持《农村新技术》杂志发展的各级领导、各界朋友和广大读者致以最诚挚的", + ] + model_inputs = tokenizer(texts, return_tensors="pt", padding=True) + token_ids = model.generate(**model_inputs) + output_texts = tokenizer.batch_decode(token_ids) + self.assertEqual(expected_output, output_texts) diff --git a/docs/transformers/tests/models/cpmant/test_tokenization_cpmant.py b/docs/transformers/tests/models/cpmant/test_tokenization_cpmant.py new file mode 100644 index 0000000000000000000000000000000000000000..c7c3209f2d669f39c84cbb349abd59987544790c --- /dev/null +++ b/docs/transformers/tests/models/cpmant/test_tokenization_cpmant.py @@ -0,0 +1,70 @@ +# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
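The CPM-Ant integration tests above run greedy generation against the full openbmb/cpm-ant-10b checkpoint, which is why they are marked @tooslow. A hedged sketch of the same generation pattern, assuming the roughly 10B-parameter checkpoint fits in available memory:

import torch

from transformers import CpmAntForCausalLM, CpmAntTokenizer

model_path = "openbmb/cpm-ant-10b"  # large checkpoint; expect a long download and high memory use
tokenizer = CpmAntTokenizer.from_pretrained(model_path)
model = CpmAntForCausalLM.from_pretrained(model_path)
model.eval()

# Batched greedy generation, mirroring test_batch_generation above.
inputs = tokenizer(["今天天气不错,", "新年快乐,万事如意!"], return_tensors="pt", padding=True)
with torch.no_grad():
    token_ids = model.generate(**inputs)
print(tokenizer.batch_decode(token_ids))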
+ +import os +import unittest + +from transformers.models.cpmant.tokenization_cpmant import VOCAB_FILES_NAMES, CpmAntTokenizer +from transformers.testing_utils import require_jieba, tooslow + +from ...test_tokenization_common import TokenizerTesterMixin + + +@require_jieba +class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "openbmb/cpm-ant-10b" + tokenizer_class = CpmAntTokenizer + test_rust_tokenizer = False + + @classmethod + def setUpClass(cls): + super().setUpClass() + + vocab_tokens = [ + "", + "", + "", + "", + "", + "", + "", + "", + "我", + "是", + "C", + "P", + "M", + "A", + "n", + "t", + ] + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + @tooslow + def test_pre_tokenization(self): + tokenizer = CpmAntTokenizer.from_pretrained("openbmb/cpm-ant-10b") + texts = "今天天气真好!" + jieba_tokens = ["今天", "天气", "真", "好", "!"] + tokens = tokenizer.tokenize(texts) + self.assertListEqual(tokens, jieba_tokens) + normalized_text = "今天天气真好!" + input_tokens = [tokenizer.bos_token] + tokens + + input_jieba_tokens = [6, 9802, 14962, 2082, 831, 244] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_jieba_tokens) + + reconstructed_text = tokenizer.decode(input_jieba_tokens) + self.assertEqual(reconstructed_text, normalized_text) diff --git a/docs/transformers/tests/models/ctrl/__init__.py b/docs/transformers/tests/models/ctrl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/ctrl/test_modeling_ctrl.py b/docs/transformers/tests/models/ctrl/test_modeling_ctrl.py new file mode 100644 index 0000000000000000000000000000000000000000..860693b5ccdf070f402792fc916e660f3af9d8b7 --- /dev/null +++ b/docs/transformers/tests/models/ctrl/test_modeling_ctrl.py @@ -0,0 +1,284 @@ +# Copyright 2018 Salesforce and HuggingFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +from transformers import CTRLConfig, is_torch_available +from transformers.testing_utils import cleanup, require_torch, slow, torch_device + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + CTRLForSequenceClassification, + CTRLLMHeadModel, + CTRLModel, + ) + + +class CTRLModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.pad_token_id = self.vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def get_config(self): + return CTRLConfig( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + dff=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + 
n_positions=self.max_position_embeddings, + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + ) + + def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = CTRLModel(config=config) + model.to(torch_device) + model.eval() + + model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(len(result.past_key_values), config.n_layer) + + def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = CTRLLMHeadModel(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask} + + return config, inputs_dict + + +@require_torch +class CTRLModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (CTRLModel, CTRLLMHeadModel, CTRLForSequenceClassification) if is_torch_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": CTRLModel, + "text-classification": CTRLForSequenceClassification, + "text-generation": CTRLLMHeadModel, + "zero-shot": CTRLForSequenceClassification, + } + if is_torch_available() + else {} + ) + test_pruning = True + test_resize_embeddings = False + test_head_masking = False + + # TODO: Fix the failed tests + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + if pipeline_test_case_name == "ZeroShotClassificationPipelineTests": + # Get `tokenizer does not have a padding token` error for both fast/slow tokenizers. + # `CTRLConfig` was never used in pipeline tests, either because of a missing checkpoint or because a tiny + # config could not be created. 
+ return True + + return False + + def setUp(self): + self.model_tester = CTRLModelTester(self) + self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37) + + def tearDown(self): + super().tearDown() + # clean-up as much as possible GPU memory occupied by PyTorch + cleanup(torch_device) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_ctrl_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_ctrl_model(*config_and_inputs) + + def test_ctrl_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "Salesforce/ctrl" + model = CTRLModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class CTRLModelLanguageGenerationTest(unittest.TestCase): + def tearDown(self): + super().tearDown() + # clean-up as much as possible GPU memory occupied by PyTorch + cleanup(torch_device, gc_collect=True) + + @slow + def test_lm_generate_ctrl(self): + model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl") + model.to(torch_device) + input_ids = torch.tensor( + [[11859, 0, 1611, 8]], dtype=torch.long, device=torch_device + ) # Legal the president is + expected_output_ids = [ + 11859, + 0, + 1611, + 8, + 5, + 150, + 26449, + 2, + 19, + 348, + 469, + 3, + 2595, + 48, + 20740, + 246533, + 246533, + 19, + 30, + 5, + ] # Legal the president is a good guy and I don't want to lose my job. \n \n I have a + + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].tolist(), expected_output_ids) diff --git a/docs/transformers/tests/models/ctrl/test_modeling_tf_ctrl.py b/docs/transformers/tests/models/ctrl/test_modeling_tf_ctrl.py new file mode 100644 index 0000000000000000000000000000000000000000..38623d442a3e04b8bfb6154fb3805f5d5770afe8 --- /dev/null +++ b/docs/transformers/tests/models/ctrl/test_modeling_tf_ctrl.py @@ -0,0 +1,292 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
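test_lm_generate_ctrl above feeds hard-coded input ids ([11859, 0, 1611, 8], i.e. "Legal the president is") and compares the greedy continuation token by token. An equivalent sketch that goes through the tokenizer instead of raw ids; the generation length is capped here for illustration, so the produced ids can differ from the test's expected list:

import torch

from transformers import CTRLLMHeadModel, CTRLTokenizer

tokenizer = CTRLTokenizer.from_pretrained("Salesforce/ctrl")
model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")
model.eval()

# "Legal" is one of CTRL's control codes; it conditions the domain/style of the continuation.
input_ids = tokenizer("Legal the president is", return_tensors="pt").input_ids
with torch.no_grad():
    output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=16)
print(tokenizer.decode(output_ids[0]))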
+ + +from __future__ import annotations + +import unittest + +from transformers import CTRLConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers.modeling_tf_utils import keras + from transformers.models.ctrl.modeling_tf_ctrl import ( + TFCTRLForSequenceClassification, + TFCTRLLMHeadModel, + TFCTRLModel, + ) + + +class TFCTRLModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_token_type_ids = True + self.use_input_mask = True + self.use_labels = True + self.use_mc_token_ids = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 2 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + self.pad_token_id = self.vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = CTRLConfig( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + dff=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = TFCTRLModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, None, input_mask] # None is the input for 'past' + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, 
token_type_ids, *args): + model = TFCTRLLMHeadModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_ctrl_for_sequence_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + config.num_labels = self.num_labels + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + inputs = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "labels": sequence_labels, + } + model = TFCTRLForSequenceClassification(config) + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFCTRLModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel, TFCTRLForSequenceClassification) if is_tf_available() else () + all_generative_model_classes = (TFCTRLLMHeadModel,) if is_tf_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": TFCTRLModel, + "text-classification": TFCTRLForSequenceClassification, + "text-generation": TFCTRLLMHeadModel, + "zero-shot": TFCTRLForSequenceClassification, + } + if is_tf_available() + else {} + ) + test_head_masking = False + test_onnx = False + + # TODO: Fix the failed tests + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + if pipeline_test_case_name == "ZeroShotClassificationPipelineTests": + # Get `tokenizer does not have a padding token` error for both fast/slow tokenizers. + # `CTRLConfig` was never used in pipeline tests, either because of a missing checkpoint or because a tiny + # config could not be created. 
+ return True + + return False + + def setUp(self): + self.model_tester = TFCTRLModelTester(self) + self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_ctrl_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_ctrl_model(*config_and_inputs) + + def test_ctrl_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_ctrl_lm_head(*config_and_inputs) + + def test_ctrl_sequence_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_ctrl_for_sequence_classification(*config_and_inputs) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + list_lm_models = [TFCTRLLMHeadModel] + list_other_models_with_output_ebd = [TFCTRLForSequenceClassification] + + for model_class in self.all_model_classes: + model = model_class(config) + model.build_in_name_scope() # may be needed for the get_bias() call below + assert isinstance(model.get_input_embeddings(), keras.layers.Layer) + + if model_class in list_lm_models: + x = model.get_output_embeddings() + assert isinstance(x, keras.layers.Layer) + name = model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + elif model_class in list_other_models_with_output_ebd: + x = model.get_output_embeddings() + assert isinstance(x, keras.layers.Layer) + name = model.get_bias() + assert name is None + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + @slow + def test_model_from_pretrained(self): + model_name = "Salesforce/ctrl" + model = TFCTRLModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFCTRLModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_ctrl(self): + model = TFCTRLLMHeadModel.from_pretrained("Salesforce/ctrl") + input_ids = tf.convert_to_tensor([[11859, 0, 1611, 8]], dtype=tf.int32) # Legal the president is + expected_output_ids = [ + 11859, + 0, + 1611, + 8, + 5, + 150, + 26449, + 2, + 19, + 348, + 469, + 3, + 2595, + 48, + 20740, + 246533, + 246533, + 19, + 30, + 5, + ] # Legal the president is a good guy and I don't want to lose my job. \n \n I have a + + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) diff --git a/docs/transformers/tests/models/ctrl/test_tokenization_ctrl.py b/docs/transformers/tests/models/ctrl/test_tokenization_ctrl.py new file mode 100644 index 0000000000000000000000000000000000000000..f62e49708cdc77a12f522a27b55038e49b0c2a09 --- /dev/null +++ b/docs/transformers/tests/models/ctrl/test_tokenization_ctrl.py @@ -0,0 +1,71 @@ +# Copyright 2018 Salesforce and HuggingFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import unittest +from functools import lru_cache + +from transformers.models.ctrl.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer + +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible + + +class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "Salesforce/ctrl" + tokenizer_class = CTRLTokenizer + test_rust_tokenizer = False + test_seq2seq = False + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt + vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", "<unk>"] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "a p", "ap t", "r e", "a d", "ad apt", ""] + cls.special_tokens_map = {"unk_token": "<unk>"} + + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(cls.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return CTRLTokenizer.from_pretrained(pretrained_name, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "adapt react readapt apt" + output_text = "adapt react readapt apt" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "adapt react readapt apt" + bpe_tokens = "adapt re@@ a@@ c@@ t re@@ adapt apt".split() + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + + input_bpe_tokens = [0, 1, 2, 4, 5, 1, 0, 3, 6] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) diff --git a/docs/transformers/tests/models/cvt/__init__.py b/docs/transformers/tests/models/cvt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/cvt/test_modeling_cvt.py b/docs/transformers/tests/models/cvt/test_modeling_cvt.py new file mode 100644 index 0000000000000000000000000000000000000000..cb7007bb6b1cfd1702ad979da3517ee9a39dcbf9 --- /dev/null +++ b/docs/transformers/tests/models/cvt/test_modeling_cvt.py @@ -0,0 +1,270 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+"""Testing suite for the PyTorch CvT model.""" + +import unittest +from math import floor + +from transformers import CvtConfig +from transformers.file_utils import cached_property, is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import CvtForImageClassification, CvtModel + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class CvtConfigTester(ConfigTester): + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, "embed_dim")) + self.parent.assertTrue(hasattr(config, "num_heads")) + + +class CvtModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=64, + num_channels=3, + embed_dim=[16, 32, 48], + num_heads=[1, 2, 3], + depth=[1, 2, 10], + patch_sizes=[7, 3, 3], + patch_stride=[4, 2, 2], + patch_padding=[2, 1, 1], + stride_kv=[2, 2, 2], + cls_token=[False, False, True], + attention_drop_rate=[0.0, 0.0, 0.0], + initializer_range=0.02, + layer_norm_eps=1e-12, + is_training=True, + use_labels=True, + num_labels=2, # Check + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_sizes = patch_sizes + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.is_training = is_training + self.use_labels = use_labels + self.num_labels = num_labels + self.num_channels = num_channels + self.embed_dim = embed_dim + self.num_heads = num_heads + self.stride_kv = stride_kv + self.depth = depth + self.cls_token = cls_token + self.attention_drop_rate = attention_drop_rate + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.num_labels) + + config = self.get_config() + return config, pixel_values, labels + + def get_config(self): + return CvtConfig( + image_size=self.image_size, + num_labels=self.num_labels, + num_channels=self.num_channels, + embed_dim=self.embed_dim, + num_heads=self.num_heads, + patch_sizes=self.patch_sizes, + patch_padding=self.patch_padding, + patch_stride=self.patch_stride, + stride_kv=self.stride_kv, + depth=self.depth, + cls_token=self.cls_token, + attention_drop_rate=self.attention_drop_rate, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = CvtModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + image_size = (self.image_size, self.image_size) + height, width = image_size[0], image_size[1] + for i in range(len(self.depth)): + height = floor(((height + 2 * self.patch_padding[i] - self.patch_sizes[i]) / self.patch_stride[i]) + 1) + width = floor(((width + 2 * self.patch_padding[i] - self.patch_sizes[i]) / self.patch_stride[i]) + 1) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.embed_dim[-1], height, width)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels 
= self.num_labels + model = CvtForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class CvtModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as Cvt does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (CvtModel, CvtForImageClassification) if is_torch_available() else () + pipeline_model_mapping = ( + {"image-feature-extraction": CvtModel, "image-classification": CvtForImageClassification} + if is_torch_available() + else {} + ) + + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = CvtModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=CvtConfig, + has_text_modality=False, + hidden_size=37, + common_properties=["hidden_size", "num_channels"], + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="Cvt does not output attentions") + def test_attention_outputs(self): + pass + + @unittest.skip(reason="Cvt does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Cvt does not support input and output embeddings") + def test_model_get_set_embeddings(self): + pass + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = len(self.model_tester.depth) + self.assertEqual(len(hidden_states), expected_num_layers) + + # verify the first hidden states (first block) + self.assertListEqual( + list(hidden_states[0].shape[-3:]), + [ + self.model_tester.embed_dim[0], + self.model_tester.image_size // 4, + self.model_tester.image_size // 4, + ], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "microsoft/cvt-13" + model = CvtModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = 
Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class CvtModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return AutoImageProcessor.from_pretrained("microsoft/cvt-13") + + @slow + def test_inference_image_classification_head(self): + model = CvtForImageClassification.from_pretrained("microsoft/cvt-13").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([0.9285, 0.9015, -0.3150]).to(torch_device) + + torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/cvt/test_modeling_tf_cvt.py b/docs/transformers/tests/models/cvt/test_modeling_tf_cvt.py new file mode 100644 index 0000000000000000000000000000000000000000..211529719aa1d58b00b1323d35fcbd8ef39b5f7e --- /dev/null +++ b/docs/transformers/tests/models/cvt/test_modeling_tf_cvt.py @@ -0,0 +1,286 @@ +"""Testing suite for the Tensorflow CvT model.""" + +from __future__ import annotations + +import inspect +import unittest +from math import floor + +import numpy as np + +from transformers import CvtConfig +from transformers.testing_utils import require_tf, require_vision, slow +from transformers.utils import cached_property, is_tf_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFCvtForImageClassification, TFCvtModel + from transformers.modeling_tf_utils import keras + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class TFCvtConfigTester(ConfigTester): + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, "embed_dim")) + self.parent.assertTrue(hasattr(config, "num_heads")) + + +class TFCvtModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=64, + num_channels=3, + embed_dim=[16, 32, 48], + num_heads=[1, 2, 3], + depth=[1, 2, 10], + patch_sizes=[7, 3, 3], + patch_stride=[4, 2, 2], + patch_padding=[2, 1, 1], + stride_kv=[2, 2, 2], + cls_token=[False, False, True], + attention_drop_rate=[0.0, 0.0, 0.0], + initializer_range=0.02, + layer_norm_eps=1e-12, + is_training=True, + use_labels=True, + num_labels=2, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_sizes = patch_sizes + self.patch_stride = patch_stride + self.patch_padding = patch_padding + self.is_training = is_training + self.use_labels = use_labels + self.num_labels = num_labels + self.num_channels = num_channels + self.embed_dim = embed_dim + self.num_heads = num_heads + self.stride_kv = stride_kv + self.depth = depth + self.cls_token = cls_token + self.attention_drop_rate = attention_drop_rate + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, 
self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + # create a random int32 tensor of given shape + labels = ids_tensor([self.batch_size], self.num_labels) + + config = self.get_config() + return config, pixel_values, labels + + def get_config(self): + return CvtConfig( + image_size=self.image_size, + num_labels=self.num_labels, + num_channels=self.num_channels, + embed_dim=self.embed_dim, + num_heads=self.num_heads, + patch_sizes=self.patch_sizes, + patch_padding=self.patch_padding, + patch_stride=self.patch_stride, + stride_kv=self.stride_kv, + depth=self.depth, + cls_token=self.cls_token, + attention_drop_rate=self.attention_drop_rate, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = TFCvtModel(config=config) + result = model(pixel_values, training=False) + image_size = (self.image_size, self.image_size) + height, width = image_size[0], image_size[1] + for i in range(len(self.depth)): + height = floor(((height + 2 * self.patch_padding[i] - self.patch_sizes[i]) / self.patch_stride[i]) + 1) + width = floor(((width + 2 * self.patch_padding[i] - self.patch_sizes[i]) / self.patch_stride[i]) + 1) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.embed_dim[-1], height, width)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = TFCvtForImageClassification(config) + result = model(pixel_values, labels=labels, training=False) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_tf +class TFCvtModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as Cvt + does not use input_ids, inputs_embeds, attention_mask and seq_length. 
+ """ + + all_model_classes = (TFCvtModel, TFCvtForImageClassification) if is_tf_available() else () + pipeline_model_mapping = ( + {"feature-extraction": TFCvtModel, "image-classification": TFCvtForImageClassification} + if is_tf_available() + else {} + ) + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + test_onnx = False + + def setUp(self): + self.model_tester = TFCvtModelTester(self) + self.config_tester = TFCvtConfigTester(self, config_class=CvtConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.create_and_test_config_common_properties() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() + + @unittest.skip(reason="Cvt does not output attentions") + def test_attention_outputs(self): + pass + + @unittest.skip(reason="Cvt does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Cvt does not support input and output embeddings") + def test_model_common_attributes(self): + pass + + @unittest.skipIf( + not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0, + reason="TF does not support backprop for grouped convolutions on CPU.", + ) + def test_dataset_conversion(self): + super().test_dataset_conversion() + + @unittest.skipIf( + not is_tf_available() or len(tf.config.list_physical_devices("GPU")) == 0, + reason="TF does not support backprop for grouped convolutions on CPU.", + ) + @slow + def test_keras_fit(self): + super().test_keras_fit() + + @unittest.skip(reason="Get `Failed to determine best cudnn convolution algo.` error after using TF 2.12+cuda 11.8") + def test_keras_fit_mixed_precision(self): + policy = keras.mixed_precision.Policy("mixed_float16") + keras.mixed_precision.set_global_policy(policy) + super().test_keras_fit() + keras.mixed_precision.set_global_policy("float32") + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + hidden_states = outputs.hidden_states + + expected_num_layers = len(self.model_tester.depth) + self.assertEqual(len(hidden_states), expected_num_layers) + + # verify the first hidden states (first block) + self.assertListEqual( + list(hidden_states[0].shape[-3:]), + [ + self.model_tester.embed_dim[0], + self.model_tester.image_size // 4, + self.model_tester.image_size // 4, + ], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del 
inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "microsoft/cvt-13" + model = TFCvtModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_tf +@require_vision +class TFCvtModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return AutoImageProcessor.from_pretrained("microsoft/cvt-13") + + @slow + def test_inference_image_classification_head(self): + model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13") + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="tf") + + # forward pass + outputs = model(**inputs) + + # verify the logits + expected_shape = tf.TensorShape((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = tf.constant([0.9285, 0.9015, -0.3150]) + self.assertTrue(np.allclose(outputs.logits[0, :3].numpy(), expected_slice, atol=1e-4)) diff --git a/docs/transformers/tests/models/dab_detr/__init__.py b/docs/transformers/tests/models/dab_detr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/dab_detr/test_modeling_dab_detr.py b/docs/transformers/tests/models/dab_detr/test_modeling_dab_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..e040a2c6624d939fb81af61a1f7eeeae1a1cc853 --- /dev/null +++ b/docs/transformers/tests/models/dab_detr/test_modeling_dab_detr.py @@ -0,0 +1,830 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch DAB-DETR model.""" + +import inspect +import math +import unittest + +from transformers import DabDetrConfig, ResNetConfig, is_torch_available, is_vision_available +from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + import torch.nn.functional as F + + from transformers import ( + DabDetrForObjectDetection, + DabDetrModel, + ) + + +if is_vision_available(): + from PIL import Image + + from transformers import ConditionalDetrImageProcessor + + +class DabDetrModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + min_size=200, + max_size=200, + n_targets=8, + num_labels=91, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.min_size = min_size + self.max_size = max_size + self.n_targets = n_targets + self.num_labels = num_labels + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = math.ceil(self.min_size / 32) * math.ceil(self.max_size / 32) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) + + pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, labels + + def get_config(self): + resnet_config = ResNetConfig( + num_channels=3, + embeddings_size=10, + hidden_sizes=[10, 20, 30, 40], + depths=[1, 1, 2, 1], + hidden_act="relu", + num_labels=3, + out_features=["stage2", "stage3", "stage4"], + out_indices=[2, 3, 4], + ) + return DabDetrConfig( + hidden_size=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + 
num_queries=self.num_queries, + num_labels=self.num_labels, + use_timm_backbone=False, + backbone_config=resnet_config, + backbone=None, + use_pretrained_backbone=False, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def create_and_check_dab_detr_model(self, config, pixel_values, pixel_mask, labels): + model = DabDetrModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size) + ) + + def create_and_check_dab_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = DabDetrForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_torch +class DabDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (DabDetrModel, DabDetrForObjectDetection) if is_torch_available() else () + pipeline_model_mapping = ( + { + "image-feature-extraction": DabDetrModel, + "object-detection": DabDetrForObjectDetection, + } + if is_torch_available() + else {} + ) + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + zero_init_hidden_state = True + test_torch_exportable = True + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ in ["DabDetrForObjectDetection"]: + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.min_size, + self.model_tester.max_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = DabDetrModelTester(self) + self.config_tester = ConfigTester(self, config_class=DabDetrConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_dab_detr_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_dab_detr_model(*config_and_inputs) + + def test_dab_detr_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_dab_detr_object_detection_head_model(*config_and_inputs) + + # TODO: check if this works again for PyTorch 2.x.y + @unittest.skip(reason="Got `CUDA error: misaligned address` with PyTorch 2.0.0.") + def test_multi_gpu_data_parallel_forward(self): + pass + + @unittest.skip(reason="DETR does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="DETR does not use inputs_embeds") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="DETR does not use inputs_embeds") + def test_inputs_embeds_matches_input_ids(self): + pass + + @unittest.skip(reason="DETR does not have a get_input_embeddings method") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="DETR is not a generative model") + def test_generate_without_input_ids(self): + pass + + @unittest.skip(reason="DETR does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @slow + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def set_nan_tensor_to_zero(t): + t[t != t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (list, tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + torch.testing.assert_close( + set_nan_tensor_to_zero(tuple_object), + set_nan_tensor_to_zero(dict_object), + atol=1e-5, + rtol=1e-5, + msg=( + "Tuple and dict output are not equal. Difference:" + f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:" + f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has" + f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}."
+ ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + if self.has_attentions: + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence( + model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + self.assertEqual(len(hidden_states), expected_num_layers) + + if hasattr(self.model_tester, "encoder_seq_length"): + seq_length = self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: + seq_length = seq_length * self.model_tester.chunk_length + else: + seq_length = self.model_tester.seq_length + + self.assertListEqual( + [hidden_states[0].shape[1], hidden_states[0].shape[2]], + [seq_length, self.model_tester.hidden_size], + ) + + if config.is_encoder_decoder: + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) + + self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + + self.assertListEqual( + [hidden_states[0].shape[1], hidden_states[0].shape[2]], + [decoder_seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + 
check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + # Had to modify the threshold to 2 decimals instead of 3 because sometimes it threw an error + def test_batching_equivalence(self): + """ + Tests that the model supports batching and that the output is nearly the same for the same input in + different batch sizes. + (Why "nearly the same" not "exactly the same"? Batching uses different matmul shapes, which often leads to + different results: https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535) + """ + + def get_tensor_equivalence_function(batched_input): + # models operating on continuous spaces have higher abs difference than LMs + # instead, we can rely on cos distance for image/speech models, similar to `diffusers` + if "input_ids" not in batched_input: + return lambda tensor1, tensor2: ( + 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=1e-38) + ) + return lambda tensor1, tensor2: torch.max(torch.abs(tensor1 - tensor2)) + + def recursive_check(batched_object, single_row_object, model_name, key): + if isinstance(batched_object, (list, tuple)): + for batched_object_value, single_row_object_value in zip(batched_object, single_row_object): + recursive_check(batched_object_value, single_row_object_value, model_name, key) + elif isinstance(batched_object, dict): + for batched_object_value, single_row_object_value in zip( + batched_object.values(), single_row_object.values() + ): + recursive_check(batched_object_value, single_row_object_value, model_name, key) + # do not compare returned loss (0-dim tensor) / codebook ids (int) / caching objects + elif batched_object is None or not isinstance(batched_object, torch.Tensor): + return + elif batched_object.dim() == 0: + return + else: + # indexing the first element does not always work + # e.g. models that output similarity scores of size (N, M) would need to index [0, 0] + slice_ids = [slice(0, index) for index in single_row_object.shape] + batched_row = batched_object[slice_ids] + self.assertFalse( + torch.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}" + ) + self.assertFalse( + torch.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}" + ) + self.assertFalse( + torch.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}" + ) + self.assertFalse( + torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}" + ) + self.assertTrue( + (equivalence(batched_row, single_row_object)) <= 1e-02, + msg=( + f"Batched and Single row outputs are not equal in {model_name} for key={key}. " + f"Difference={equivalence(batched_row, single_row_object)}."
+ ), + ) + + config, batched_input = self.model_tester.prepare_config_and_inputs_for_common() + equivalence = get_tensor_equivalence_function(batched_input) + + for model_class in self.all_model_classes: + config.output_hidden_states = True + + model_name = model_class.__name__ + if hasattr(self.model_tester, "prepare_config_and_inputs_for_model_class"): + config, batched_input = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) + batched_input_prepared = self._prepare_for_class(batched_input, model_class) + model = model_class(config).to(torch_device).eval() + + batch_size = self.model_tester.batch_size + single_row_input = {} + for key, value in batched_input_prepared.items(): + if isinstance(value, torch.Tensor) and value.shape[0] % batch_size == 0: + # e.g. musicgen has inputs of size (bs*codebooks). in most cases value.shape[0] == batch_size + single_batch_shape = value.shape[0] // batch_size + single_row_input[key] = value[:single_batch_shape] + else: + single_row_input[key] = value + + with torch.no_grad(): + model_batched_output = model(**batched_input_prepared) + model_row_output = model(**single_row_input) + + if isinstance(model_batched_output, torch.Tensor): + model_batched_output = {"model_output": model_batched_output} + model_row_output = {"model_output": model_row_output} + + for key in model_batched_output: + # DETR starts from zero-init queries to decoder, leading to cos_similarity = `nan` + if hasattr(self, "zero_init_hidden_state") and "decoder_hidden_states" in key: + model_batched_output[key] = model_batched_output[key][1:] + model_row_output[key] = model_row_output[key][1:] + recursive_check(model_batched_output[key], model_row_output[key], model_name, key) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + decoder_seq_length = self.model_tester.decoder_seq_length + encoder_seq_length = self.model_tester.encoder_seq_length + decoder_key_length = self.model_tester.decoder_seq_length + encoder_key_length = self.model_tester.encoder_seq_length + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + del inputs_dict["output_hidden_states"] + config.output_attentions = True + config.output_hidden_states = False + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + if self.is_encoder_decoder: + correct_outlen = 6 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + if "past_key_values" in outputs: + correct_outlen += 
1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + # decoder_hidden_states, encoder_last_hidden_state, encoder_hidden_states + added_hidden_states = 3 + else: + added_hidden_states = 1 + + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_retain_grad_hidden_states_attentions(self): + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs, output_attentions=True, output_hidden_states=True) + + # logits + output = outputs[0] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_hidden_states.retain_grad() + + encoder_attentions = outputs.encoder_attentions[0] + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_auxiliary_loss(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.auxiliary_loss = True + + # only test for object detection and segmentation model + for model_class in self.all_model_classes[1:]: + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + + outputs = model(**inputs) + + self.assertIsNotNone(outputs.auxiliary_outputs) + self.assertEqual(len(outputs.auxiliary_outputs), 
self.model_tester.num_hidden_layers - 1) + + def test_training(self): + if not self.model_tester.is_training: + self.skipTest(reason="ModelTester is not configured to run training tests") + + # We only have loss with ObjectDetection + model_class = self.all_model_classes[-1] + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if model.config.is_encoder_decoder: + expected_arg_names = ["pixel_values", "pixel_mask"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["pixel_values", "pixel_mask"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_different_timm_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "DabDetrForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) + + self.assertTrue(outputs) + + def test_initialization(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + configs_no_init.init_xavier_std = 1e9 + # Copied from RT-DETR + configs_no_init.initializer_bias_prior_prob = 0.2 + bias_value = -1.3863 # log_e ((1 - 0.2) / 0.2) + + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if "bbox_attention" in name and "bias" not in name: + self.assertLess( + 100000, + abs(param.data.max().item()), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + # Modified from RT-DETR + elif "class_embed" in name and "bias" in name: + bias_tensor = torch.full_like(param.data, bias_value) + torch.testing.assert_close( + param.data, + bias_tensor, + atol=1e-4, + rtol=1e-4, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + elif "activation_fn" in name and config.activation_function == "prelu": + 
self.assertTrue( + param.data.mean() == 0.25, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + elif "backbone.conv_encoder.model" in name: + continue + elif "self_attn.in_proj_weight" in name: + self.assertIn( + ((param.data.mean() * 1e2).round() / 1e2).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + +TOLERANCE = 1e-4 +CHECKPOINT = "IDEA-Research/dab-detr-resnet-50" + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class DabDetrModelIntegrationTests(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ConditionalDetrImageProcessor.from_pretrained(CHECKPOINT) if is_vision_available() else None + + def test_inference_no_head(self): + model = DabDetrModel.from_pretrained(CHECKPOINT).to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values=encoding.pixel_values) + + expected_shape = torch.Size((1, 300, 256)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + expected_slice = torch.tensor( + [[-0.4879, -0.2594, 0.4524], [-0.4997, -0.4258, 0.4329], [-0.8220, -0.4996, 0.0577]] + ).to(torch_device) + torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=2e-4, rtol=2e-4) + + def test_inference_object_detection_head(self): + model = DabDetrForObjectDetection.from_pretrained(CHECKPOINT).to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values) + + # verify logits + box predictions + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-10.1765, -5.5243, -8.9324], [-9.8138, -5.6721, -7.5161], [-10.3054, -5.6081, -8.5931]] + ).to(torch_device) + torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice_logits, atol=3e-4, rtol=3e-4) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.3708, 0.3000, 0.2753], [0.5211, 0.6125, 0.9495], [0.2897, 0.6730, 0.5459]] + ).to(torch_device) + torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4, rtol=1e-4) + + # verify postprocessing + results = image_processor.post_process_object_detection( + outputs, threshold=0.3, target_sizes=[image.size[::-1]] + )[0] + expected_scores = torch.tensor([0.8732, 0.8563, 0.8554, 0.6079, 0.5896]).to(torch_device) + expected_labels = [17, 75, 17, 75, 63] + expected_boxes = torch.tensor([14.6970, 49.3892, 320.5165, 469.2765]).to(torch_device) + + self.assertEqual(len(results["scores"]), 5) + torch.testing.assert_close(results["scores"], expected_scores, atol=1e-4, rtol=1e-4) + 
self.assertSequenceEqual(results["labels"].tolist(), expected_labels) + torch.testing.assert_close(results["boxes"][0, :], expected_boxes, atol=1e-4, rtol=1e-4) diff --git a/docs/transformers/tests/models/dac/__init__.py b/docs/transformers/tests/models/dac/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/dac/test_feature_extraction_dac.py b/docs/transformers/tests/models/dac/test_feature_extraction_dac.py new file mode 100644 index 0000000000000000000000000000000000000000..13d72326078756f5ba80dc5fa075da7559d4e018 --- /dev/null +++ b/docs/transformers/tests/models/dac/test_feature_extraction_dac.py @@ -0,0 +1,215 @@ +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for the dac feature extractor.""" + +import itertools +import random +import unittest + +import numpy as np + +from transformers import DacFeatureExtractor +from transformers.testing_utils import require_torch +from transformers.utils.import_utils import is_torch_available + +from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin + + +if is_torch_available(): + import torch + + +global_rng = random.Random() + + +# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values + + +@require_torch +# Copied from transformers.tests.encodec.test_feature_extraction_dac.EncodecFeatureExtractionTester with Encodec->Dac +class DacFeatureExtractionTester: + # Ignore copy + def __init__( + self, + parent, + batch_size=7, + min_seq_length=400, + max_seq_length=2000, + feature_size=1, + padding_value=0.0, + sampling_rate=16000, + hop_length=512, + ): + self.parent = parent + self.batch_size = batch_size + self.min_seq_length = min_seq_length + self.max_seq_length = max_seq_length + self.hop_length = hop_length + self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) + self.feature_size = feature_size + self.padding_value = padding_value + self.sampling_rate = sampling_rate + + # Ignore copy + def prepare_feat_extract_dict(self): + return { + "feature_size": self.feature_size, + "padding_value": self.padding_value, + "sampling_rate": self.sampling_rate, + "hop_length": self.hop_length, + } + + def prepare_inputs_for_common(self, equal_length=False, numpify=False): + def _flatten(list_of_lists): + return list(itertools.chain(*list_of_lists)) + + if equal_length: + audio_inputs = floats_list((self.batch_size, self.max_seq_length)) + else: + # make sure that inputs increase in size + audio_inputs = [ + _flatten(floats_list((x, self.feature_size))) + for x in range(self.min_seq_length, self.max_seq_length, 
self.seq_length_diff) + ] + + if numpify: + audio_inputs = [np.asarray(x) for x in audio_inputs] + + return audio_inputs + + +@require_torch +# Copied from transformers.tests.encodec.test_feature_extraction_dac.EnCodecFeatureExtractionTest with Encodec->Dac +class DacFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): + feature_extraction_class = DacFeatureExtractor + + def setUp(self): + self.feat_extract_tester = DacFeatureExtractionTester(self) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + # create three inputs of length 800, 1000, and 1200 + audio_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + np_audio_inputs = [np.asarray(audio_input) for audio_input in audio_inputs] + + # Test not batched input + encoded_sequences_1 = feat_extract(audio_inputs[0], return_tensors="np").input_values + encoded_sequences_2 = feat_extract(np_audio_inputs[0], return_tensors="np").input_values + self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) + + # Test batched + encoded_sequences_1 = feat_extract(audio_inputs, padding=True, return_tensors="np").input_values + encoded_sequences_2 = feat_extract(np_audio_inputs, padding=True, return_tensors="np").input_values + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + def test_double_precision_pad(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + np_audio_inputs = np.random.rand(100).astype(np.float64) + py_audio_inputs = np_audio_inputs.tolist() + + for inputs in [py_audio_inputs, np_audio_inputs]: + np_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="np") + self.assertTrue(np_processed.input_values.dtype == np.float32) + pt_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="pt") + self.assertTrue(pt_processed.input_values.dtype == torch.float32) + + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return [x["array"] for x in audio_samples] + + def test_integration(self): + # fmt: off + EXPECTED_INPUT_VALUES = torch.tensor( + [ 2.3803711e-03, 2.0751953e-03, 1.9836426e-03, 2.1057129e-03, + 1.6174316e-03, 3.0517578e-04, 9.1552734e-05, 3.3569336e-04, + 9.7656250e-04, 1.8310547e-03, 2.0141602e-03, 2.1057129e-03, + 1.7395020e-03, 4.5776367e-04, -3.9672852e-04, 4.5776367e-04, + 1.0070801e-03, 9.1552734e-05, 4.8828125e-04, 1.1596680e-03, + 7.3242188e-04, 9.4604492e-04, 1.8005371e-03, 1.8310547e-03, + 8.8500977e-04, 4.2724609e-04, 4.8828125e-04, 7.3242188e-04, + 1.0986328e-03, 2.1057129e-03] + ) + # fmt: on + input_audio = self._load_datasamples(1) + feature_extractor = DacFeatureExtractor() + input_values = feature_extractor(input_audio, return_tensors="pt")["input_values"] + self.assertEqual(input_values.shape, (1, 1, 93696)) + torch.testing.assert_close(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, rtol=1e-4, atol=1e-4) + audio_input_end = torch.tensor(input_audio[0][-30:], dtype=torch.float32) + torch.testing.assert_close(input_values[0, 0, -46:-16], audio_input_end, rtol=1e-4, 
atol=1e-4) + + # Ignore copy + @unittest.skip("The DAC model doesn't support stereo logic") + def test_integration_stereo(self): + pass + + # Ignore copy + def test_truncation_and_padding(self): + input_audio = self._load_datasamples(2) + # would be easier if the stride was like + feature_extractor = DacFeatureExtractor() + + # pad and trunc raise an error ? + with self.assertRaisesRegex( + ValueError, + "^Both padding and truncation were set. Make sure you only set one.$", + ): + truncated_outputs = feature_extractor( + input_audio, padding="max_length", truncation=True, return_tensors="pt" + ).input_values + + # force truncate to max_length + truncated_outputs = feature_extractor( + input_audio, truncation=True, max_length=48000, return_tensors="pt" + ).input_values + self.assertEqual(truncated_outputs.shape, (2, 1, 48128)) + + # pad: + padded_outputs = feature_extractor(input_audio, padding=True, return_tensors="pt").input_values + self.assertEqual(padded_outputs.shape, (2, 1, 93696)) + + # force pad to max length + truncated_outputs = feature_extractor( + input_audio, padding="max_length", max_length=100000, return_tensors="pt" + ).input_values + self.assertEqual(truncated_outputs.shape, (2, 1, 100352)) + + # force no pad + with self.assertRaisesRegex( + ValueError, + "^Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.$", + ): + truncated_outputs = feature_extractor(input_audio, padding=False, return_tensors="pt").input_values + + truncated_outputs = feature_extractor(input_audio[0], padding=False, return_tensors="pt").input_values + self.assertEqual(truncated_outputs.shape, (1, 1, 93680)) diff --git a/docs/transformers/tests/models/dac/test_modeling_dac.py b/docs/transformers/tests/models/dac/test_modeling_dac.py new file mode 100644 index 0000000000000000000000000000000000000000..d001a953cef4df381f8d8d83c932c63c3b0c56d2 --- /dev/null +++ b/docs/transformers/tests/models/dac/test_modeling_dac.py @@ -0,0 +1,756 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Dac model.""" + +import inspect +import os +import tempfile +import unittest + +import numpy as np +from datasets import Audio, load_dataset + +from transformers import AutoProcessor, DacConfig, DacModel +from transformers.testing_utils import is_torch_available, require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + +@require_torch +# Copied from transformers.tests.encodec.test_modeling_encodec.EncodecModelTester with Encodec->Dac +class DacModelTester: + # Ignore copy + def __init__( + self, + parent, + batch_size=3, + num_channels=1, + is_training=False, + intermediate_size=1024, + encoder_hidden_size=16, + downsampling_ratios=[2, 4, 4], + decoder_hidden_size=16, + n_codebooks=6, + codebook_size=512, + codebook_dim=4, + quantizer_dropout=0.0, + commitment_loss_weight=0.25, + codebook_loss_weight=1.0, + sample_rate=16000, + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.is_training = is_training + self.intermediate_size = intermediate_size + self.sample_rate = sample_rate + + self.encoder_hidden_size = encoder_hidden_size + self.downsampling_ratios = downsampling_ratios + self.decoder_hidden_size = decoder_hidden_size + self.n_codebooks = n_codebooks + self.codebook_size = codebook_size + self.codebook_dim = codebook_dim + self.quantizer_dropout = quantizer_dropout + self.commitment_loss_weight = commitment_loss_weight + self.codebook_loss_weight = codebook_loss_weight + + def prepare_config_and_inputs(self): + input_values = floats_tensor([self.batch_size, self.num_channels, self.intermediate_size], scale=1.0) + config = self.get_config() + inputs_dict = {"input_values": input_values} + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def prepare_config_and_inputs_for_model_class(self, model_class): + input_values = floats_tensor([self.batch_size, self.num_channels, self.intermediate_size], scale=1.0) + config = self.get_config() + inputs_dict = {"input_values": input_values} + + return config, inputs_dict + + # Ignore copy + def get_config(self): + return DacConfig( + encoder_hidden_size=self.encoder_hidden_size, + downsampling_ratios=self.downsampling_ratios, + decoder_hidden_size=self.decoder_hidden_size, + n_codebooks=self.n_codebooks, + codebook_size=self.codebook_size, + codebook_dim=self.codebook_dim, + quantizer_dropout=self.quantizer_dropout, + commitment_loss_weight=self.commitment_loss_weight, + codebook_loss_weight=self.codebook_loss_weight, + ) + + # Ignore copy + def create_and_check_model_forward(self, config, inputs_dict): + model = DacModel(config=config).to(torch_device).eval() + + input_values = inputs_dict["input_values"] + result = model(input_values) + self.parent.assertEqual(result.audio_values.shape, (self.batch_size, self.intermediate_size)) + + +@require_torch +# Copied from transformers.tests.encodec.test_modeling_encodec.EncodecModelTest with Encodec->Dac +class DacModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (DacModel,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_headmasking = False + test_resize_embeddings = False + pipeline_model_mapping = 
{"feature-extraction": DacModel} if is_torch_available() else {} + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + # model does not have attention and does not support returning hidden states + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + if "output_attentions" in inputs_dict: + inputs_dict.pop("output_attentions") + if "output_hidden_states" in inputs_dict: + inputs_dict.pop("output_hidden_states") + return inputs_dict + + def setUp(self): + self.model_tester = DacModelTester(self) + self.config_tester = ConfigTester( + self, config_class=DacConfig, hidden_size=37, common_properties=[], has_text_modality=False + ) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model_forward(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_forward(*config_and_inputs) + + # TODO (ydshieh): Although we have a potential cause, it's still strange that this test fails all the time with large differences + @unittest.skip(reason="Might be caused by `indices` computed with `max()` in `decode_latents`") + def test_batching_equivalence(self): + super().test_batching_equivalence() + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + # Ignore copy + expected_arg_names = ["input_values", "n_quantizers", "return_dict"] + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + @unittest.skip("The DacModel is not transformers based, thus it does not have `inputs_embeds` logics") + def test_inputs_embeds(self): + pass + + @unittest.skip("The DacModel is not transformers based, thus it does not have `inputs_embeds` logics") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip("The DacModel is not transformers based, thus it does not have the usual `attention` logic") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip("The DacModel is not transformers based, thus it does not have the usual `attention` logic") + def test_torchscript_output_attentions(self): + pass + + @unittest.skip("The DacModel is not transformers based, thus it does not have the usual `hidden_states` logic") + def test_torchscript_output_hidden_state(self): + pass + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + return + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + configs_no_init.return_dict = False + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + inputs = self._prepare_for_class(inputs_dict, model_class) + + main_input_name = model_class.main_input_name + + try: + main_input = inputs[main_input_name] + model(main_input) + traced_model = torch.jit.trace(model, main_input) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = 
torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + non_persistent_buffers = {} + for key in loaded_model_state_dict.keys(): + if key not in model_state_dict.keys(): + non_persistent_buffers[key] = loaded_model_state_dict[key] + + loaded_model_state_dict = { + key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers + } + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + model_buffers = list(model.buffers()) + for non_persistent_buffer in non_persistent_buffers.values(): + found_buffer = False + for i, model_buffer in enumerate(model_buffers): + if torch.equal(non_persistent_buffer, model_buffer): + found_buffer = True + break + + self.assertTrue(found_buffer) + model_buffers.pop(i) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + if layer_name in loaded_model_state_dict: + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + # Avoid memory leak. Without this, each call increase RAM usage by ~20MB. + # (Even with this call, there are still memory leak by ~0.04MB) + self.clear_torch_jit_class_registry() + + @unittest.skip("The DacModel is not transformers based, thus it does not have the usual `attention` logic") + def test_attention_outputs(self): + pass + + @unittest.skip("The DacModel is not transformers based, thus it does not have the usual `hidden_states` logic") + def test_hidden_states_output(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_checkpoints(self): + pass + + @unittest.skip("No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_no_safetensors(self): + pass + + def test_determinism(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_determinism(first, second): + # outputs are not tensors but list (since each sequence don't have the same frame_length) + out_1 = first.cpu().numpy() + out_2 = second.cpu().numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + first = model(**self._prepare_for_class(inputs_dict, model_class))[0] + second = model(**self._prepare_for_class(inputs_dict, model_class))[0] + + if isinstance(first, tuple) and isinstance(second, tuple): + for tensor1, tensor2 in zip(first, second): + check_determinism(tensor1, tensor2) + else: + check_determinism(first, second) + + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def set_nan_tensor_to_zero(t): + t[t 
!= t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (list, tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + torch.allclose( + set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 + ), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:" + f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has" + f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}." + ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + # Ignore copy + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = ["conv", "in_proj", "out_proj", "codebook"] + if param.requires_grad: + if any(x in name for x in uniform_init_parms): + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_identity_shortcut(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + config.use_conv_shortcut = False + self.model_tester.create_and_check_model_forward(config, inputs_dict) + + +def normalize(arr): + norm = np.linalg.norm(arr) + normalized_arr = arr / norm + return normalized_arr + + +def compute_rmse(arr1, arr2): + arr1_normalized = normalize(arr1) + arr2_normalized = normalize(arr2) + return np.sqrt(((arr1_normalized - arr2_normalized) ** 2).mean()) + + +@slow +@require_torch +class DacIntegrationTest(unittest.TestCase): + def test_integration_16khz(self): + expected_rmse = 0.004 + + expected_encoder_sums_dict = { + "loss": 24.8596, + "quantized_representation": -0.0745, + "audio_codes": 504.0948, + "projected_latents": 0.0682, + } + + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + + model_name = "dac_16khz" + + model_id = f"descript/{model_name}" + model = DacModel.from_pretrained(model_id, force_download=True).to(torch_device).eval() + processor = AutoProcessor.from_pretrained(model_id) + + librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) + audio_sample = librispeech_dummy[0]["audio"]["array"] + + inputs = processor( + raw_audio=audio_sample, + 
sampling_rate=processor.sampling_rate, + return_tensors="pt", + ).to(torch_device) + + with torch.no_grad(): + encoder_outputs = model.encode(inputs["input_values"]) + + expected_encoder_sums = torch.tensor(list(expected_encoder_sums_dict.values()), dtype=torch.float32) + encoder_outputs_mean = torch.tensor([v.float().mean().cpu().item() for v in encoder_outputs.to_tuple()]) + + # make sure audio encoded codes are correct + torch.testing.assert_close(encoder_outputs_mean, expected_encoder_sums, rtol=1e-3, atol=1e-3) + + _, quantized_representation, _, _ = encoder_outputs.to_tuple() + input_values_dec = model.decode(quantized_representation)[0] + input_values_enc_dec = model(inputs["input_values"])[1] + + # make sure forward and decode gives same result + torch.testing.assert_close(input_values_dec, input_values_enc_dec, rtol=1e-3, atol=1e-3) + + arr = inputs["input_values"][0].cpu().numpy() + arr_enc_dec = input_values_enc_dec[0].cpu().numpy() + + max_length = min(arr_enc_dec.shape[-1], arr.shape[-1]) + + arr_cut = arr[0, :max_length].copy() + arr_enc_dec_cut = arr_enc_dec[:max_length].copy() + + # make sure audios are more or less equal + rmse = compute_rmse(arr_cut, arr_enc_dec_cut) + self.assertTrue(rmse < expected_rmse) + + def test_integration_24khz(self): + expected_rmse = 0.0039 + + expected_encoder_output_dict = { + "quantized_representation": torch.tensor([0.6257, 3.1245, 5.2514, 2.3160, 1.5774]), + "audio_codes": torch.tensor([919, 919, 234, 777, 234]), + "projected_latents": torch.tensor([-4.7841, -5.0063, -4.5595, -5.0372, -5.4280]), + } + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + + model_name = "dac_24khz" + + model_id = f"descript/{model_name}" + model = DacModel.from_pretrained(model_id, force_download=True).to(torch_device).eval() + processor = AutoProcessor.from_pretrained(model_id) + + librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) + audio_sample = librispeech_dummy[0]["audio"]["array"] + + inputs = processor( + raw_audio=audio_sample, + sampling_rate=processor.sampling_rate, + return_tensors="pt", + ).to(torch_device) + + with torch.no_grad(): + encoder_outputs = model.encode(inputs["input_values"]) + + expected_quantized_representation = encoder_outputs["quantized_representation"][0, 0, :5].cpu() + expected_audio_codes = encoder_outputs["audio_codes"][0, 0, :5].cpu() + expected_projected_latents = encoder_outputs["projected_latents"][0, 0, :5].cpu() + + # make sure values are correct for audios slices + self.assertTrue( + torch.allclose( + expected_quantized_representation, + expected_encoder_output_dict["quantized_representation"], + atol=1e-3, + ) + ) + self.assertTrue( + torch.allclose(expected_audio_codes, expected_encoder_output_dict["audio_codes"], atol=1e-3) + ) + self.assertTrue( + torch.allclose( + expected_projected_latents, expected_encoder_output_dict["projected_latents"], atol=1e-3 + ) + ) + + _, quantized_representation, _, _ = encoder_outputs.to_tuple() + input_values_dec = model.decode(quantized_representation)[0] + input_values_enc_dec = model(inputs["input_values"])[1] + + input_values_from_codes = model.decode(audio_codes=encoder_outputs.audio_codes)[0] + + # make sure decode from audio codes and quantized values give more or less the same results + torch.testing.assert_close(input_values_from_codes, input_values_dec, rtol=1e-5, atol=1e-5) + + # make sure forward and decode gives same result + 
torch.testing.assert_close(input_values_dec, input_values_enc_dec, rtol=1e-3, atol=1e-3) + + arr = inputs["input_values"][0].cpu().numpy() + arr_enc_dec = input_values_enc_dec[0].cpu().numpy() + + max_length = min(arr_enc_dec.shape[-1], arr.shape[-1]) + + arr_cut = arr[0, :max_length].copy() + arr_enc_dec_cut = arr_enc_dec[:max_length].copy() + + # make sure audios are more or less equal + rmse = compute_rmse(arr_cut, arr_enc_dec_cut) + self.assertTrue(rmse < expected_rmse) + + def test_integration_44khz(self): + expected_rmse = 0.002 + + expected_encoder_sums_dict = { + "loss": 34.3612, + "quantized_representation": 0.0078, + "audio_codes": 509.6812, + "projected_latents": -0.1054, + } + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + + model_name = "dac_44khz" + + model_id = f"descript/{model_name}" + model = DacModel.from_pretrained(model_id).to(torch_device).eval() + processor = AutoProcessor.from_pretrained(model_id) + + librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) + audio_sample = librispeech_dummy[0]["audio"]["array"] + + inputs = processor( + raw_audio=audio_sample, + sampling_rate=processor.sampling_rate, + return_tensors="pt", + ).to(torch_device) + + with torch.no_grad(): + encoder_outputs = model.encode(inputs["input_values"]) + + expected_encoder_sums = torch.tensor(list(expected_encoder_sums_dict.values()), dtype=torch.float32) + encoder_outputs_mean = torch.tensor([v.float().mean().cpu().item() for v in encoder_outputs.to_tuple()]) + + # make sure audio encoded codes are correct + torch.testing.assert_close(encoder_outputs_mean, expected_encoder_sums, rtol=1e-3, atol=1e-3) + + _, quantized_representation, _, _ = encoder_outputs.to_tuple() + input_values_dec = model.decode(quantized_representation)[0] + input_values_enc_dec = model(inputs["input_values"])[1] + + # make sure forward and decode gives same result + torch.testing.assert_close(input_values_dec, input_values_enc_dec, rtol=1e-3, atol=1e-3) + + arr = inputs["input_values"][0].cpu().numpy() + arr_enc_dec = input_values_enc_dec[0].cpu().numpy() + + max_length = min(arr_enc_dec.shape[-1], arr.shape[-1]) + + arr_cut = arr[0, :max_length].copy() + arr_enc_dec_cut = arr_enc_dec[:max_length].copy() + + # make sure audios are more or less equal + rmse = compute_rmse(arr_cut, arr_enc_dec_cut) + self.assertTrue(rmse < expected_rmse) + + def test_integration_batch_16khz(self): + expected_rmse = 0.002 + + expected_encoder_sums_dict = { + "loss": 20.3913, + "quantized_representation": -0.0538, + "audio_codes": 487.8470, + "projected_latents": 0.0237, + } + + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + + model_name = "dac_16khz" + + model_id = f"descript/{model_name}" + model = DacModel.from_pretrained(model_id).to(torch_device) + processor = AutoProcessor.from_pretrained(model_id) + + librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) + + audio_samples = [np.array([audio_sample["array"]])[0] for audio_sample in librispeech_dummy[-2:]["audio"]] + + inputs = processor( + raw_audio=audio_samples, + sampling_rate=processor.sampling_rate, + truncation=False, + return_tensors="pt", + ).to(torch_device) + + with torch.no_grad(): + encoder_outputs = model.encode(inputs["input_values"]) + + expected_encoder_sums = torch.tensor(list(expected_encoder_sums_dict.values()), dtype=torch.float32) + 
encoder_outputs_mean = torch.tensor([v.float().mean().item() for v in encoder_outputs.to_tuple()]) + + # make sure audio encoded codes are correct + torch.testing.assert_close(encoder_outputs_mean, expected_encoder_sums, rtol=1e-3, atol=1e-3) + + _, quantized_representation, _, _ = encoder_outputs.to_tuple() + input_values_dec = model.decode(quantized_representation)[0] + input_values_enc_dec = model(inputs["input_values"])[1] + + # make sure forward and decode gives same result + torch.testing.assert_close(input_values_dec, input_values_enc_dec, rtol=1e-3, atol=1e-3) + + arr = inputs["input_values"].cpu().numpy() + arr_enc_dec = input_values_enc_dec.cpu().numpy() + + max_length = min(arr_enc_dec.shape[-1], arr.shape[-1]) + + arr_cut = arr[:, 0, :max_length].copy() + arr_enc_dec_cut = arr_enc_dec[:, :max_length].copy() + + # make sure audios are more or less equal + rmse = compute_rmse(arr_cut, arr_enc_dec_cut) + self.assertTrue(rmse < expected_rmse) + + def test_integration_batch_24khz(self): + expected_rmse = 0.002 + + expected_encoder_sums_dict = { + "loss": 24.2309, + "quantized_representation": 0.0520, + "audio_codes": 510.2700, + "projected_latents": -0.0076, + } + + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + + model_name = "dac_24khz" + + model_id = f"descript/{model_name}" + model = DacModel.from_pretrained(model_id).to(torch_device) + processor = AutoProcessor.from_pretrained(model_id) + + librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) + + audio_samples = [np.array([audio_sample["array"]])[0] for audio_sample in librispeech_dummy[-2:]["audio"]] + + inputs = processor( + raw_audio=audio_samples, + sampling_rate=processor.sampling_rate, + truncation=False, + return_tensors="pt", + ).to(torch_device) + + with torch.no_grad(): + encoder_outputs = model.encode(inputs["input_values"]) + + expected_encoder_sums = torch.tensor(list(expected_encoder_sums_dict.values()), dtype=torch.float32) + encoder_outputs_mean = torch.tensor([v.float().mean().cpu().item() for v in encoder_outputs.to_tuple()]) + + # make sure audio encoded codes are correct + torch.testing.assert_close(encoder_outputs_mean, expected_encoder_sums, rtol=1e-3, atol=1e-3) + + _, quantized_representation, _, _ = encoder_outputs.to_tuple() + input_values_dec = model.decode(quantized_representation)[0] + input_values_enc_dec = model(inputs["input_values"])[1] + + # make sure forward and decode gives same result + torch.testing.assert_close(input_values_dec, input_values_enc_dec, rtol=1e-3, atol=1e-3) + + arr = inputs["input_values"].cpu().numpy() + arr_enc_dec = input_values_enc_dec.cpu().numpy() + + max_length = min(arr_enc_dec.shape[-1], arr.shape[-1]) + + arr_cut = arr[:, 0, :max_length].copy() + arr_enc_dec_cut = arr_enc_dec[:, :max_length].copy() + + # make sure audios are more or less equal + rmse = compute_rmse(arr_cut, arr_enc_dec_cut) + self.assertTrue(rmse < expected_rmse) + + def test_integration_batch_44khz(self): + expected_rmse = 0.001 + + expected_encoder_sums_dict = { + "loss": 25.9233, + "quantized_representation": 0.0013, + "audio_codes": 528.5620, + "projected_latents": -0.1194, + } + + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + + model_name = "dac_44khz" + + model_id = f"descript/{model_name}" + model = DacModel.from_pretrained(model_id).to(torch_device) + processor = AutoProcessor.from_pretrained(model_id) + + 
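+        # As in the other integration tests, the reconstruction check at the end of this test
+        # relies on the `compute_rmse` helper used throughout this file; conceptually it is a
+        # root-mean-square error over the two waveforms (presumably after normalization),
+        # i.e. roughly np.sqrt(np.mean((a - b) ** 2)).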
librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) + + audio_samples = [np.array([audio_sample["array"]])[0] for audio_sample in librispeech_dummy[-2:]["audio"]] + + inputs = processor( + raw_audio=audio_samples, + sampling_rate=processor.sampling_rate, + truncation=False, + return_tensors="pt", + ).to(torch_device) + + with torch.no_grad(): + encoder_outputs = model.encode(inputs["input_values"]) + + expected_encoder_sums = torch.tensor(list(expected_encoder_sums_dict.values()), dtype=torch.float32) + encoder_outputs_mean = torch.tensor([v.float().mean().cpu().item() for v in encoder_outputs.to_tuple()]) + + # make sure audio encoded codes are correct + torch.testing.assert_close(encoder_outputs_mean, expected_encoder_sums, rtol=1e-3, atol=1e-3) + + _, quantized_representation, _, _ = encoder_outputs.to_tuple() + input_values_dec = model.decode(quantized_representation)[0] + input_values_enc_dec = model(inputs["input_values"])[1] + + # make sure forward and decode gives same result + torch.testing.assert_close(input_values_dec, input_values_enc_dec, rtol=1e-3, atol=1e-3) + + arr = inputs["input_values"].cpu().numpy() + arr_enc_dec = input_values_enc_dec.cpu().numpy() + + max_length = min(arr_enc_dec.shape[-1], arr.shape[-1]) + + arr_cut = arr[:, 0, :max_length].copy() + arr_enc_dec_cut = arr_enc_dec[:, :max_length].copy() + + # make sure audios are more or less equal + rmse = compute_rmse(arr_cut, arr_enc_dec_cut) + self.assertTrue(rmse < expected_rmse) diff --git a/docs/transformers/tests/models/data2vec/__init__.py b/docs/transformers/tests/models/data2vec/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/data2vec/test_modeling_data2vec_audio.py b/docs/transformers/tests/models/data2vec/test_modeling_data2vec_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..96e970beb6b939bc8071e066d791d06983b41a9f --- /dev/null +++ b/docs/transformers/tests/models/data2vec/test_modeling_data2vec_audio.py @@ -0,0 +1,713 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
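+# Note: Data2VecAudio largely mirrors the Wav2Vec2 architecture (feature encoder plus
+# Transformer stack), which is why these tests reuse `Wav2Vec2Processor` and the shared
+# `_compute_mask_indices` SpecAugment-style masking helper instead of a dedicated
+# Data2VecAudio processor.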
+"""Testing suite for the PyTorch Data2VecAudio model.""" + +import math +import unittest + +import numpy as np +from datasets import load_dataset + +from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask +from transformers import Data2VecAudioConfig, is_torch_available +from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForCTC, + Data2VecAudioForSequenceClassification, + Data2VecAudioForXVector, + Data2VecAudioModel, + Wav2Vec2Processor, + ) + from transformers.models.data2vec.modeling_data2vec_audio import _compute_mask_indices + + +class Data2VecAudioModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=1024, # speech is longer + is_training=False, + hidden_size=16, + feat_extract_dropout=0.0, + feat_extract_activation="gelu", + conv_dim=(32, 32, 32), + conv_stride=(4, 4, 4), + conv_kernel=(8, 8, 8), + conv_bias=False, + num_conv_pos_embeddings=16, + num_conv_pos_embedding_groups=2, + num_hidden_layers=2, + num_attention_heads=2, + hidden_dropout_prob=0.1, + intermediate_size=20, + layer_norm_eps=1e-5, + hidden_act="gelu", + initializer_range=0.02, + mask_time_prob=0.5, + mask_time_length=2, + vocab_size=32, + num_adapter_layers=1, + adapter_stride=2, + tdnn_dim=(32, 32), + tdnn_kernel=(5, 3), + tdnn_dilation=(1, 2), + xvector_output_dim=32, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.hidden_size = hidden_size + self.feat_extract_dropout = feat_extract_dropout + self.feat_extract_activation = feat_extract_activation + self.conv_dim = conv_dim + self.conv_stride = conv_stride + self.conv_kernel = conv_kernel + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.intermediate_size = intermediate_size + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.num_adapter_layers = num_adapter_layers + self.adapter_stride = adapter_stride + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.scope = scope + self.tdnn_dim = tdnn_dim + self.tdnn_kernel = tdnn_kernel + self.tdnn_dilation = tdnn_dilation + self.xvector_output_dim = xvector_output_dim + + output_seq_length = self.seq_length + for kernel, stride in zip(self.conv_kernel, self.conv_stride): + output_seq_length = (output_seq_length - (kernel - 1)) / stride + self.output_seq_length = int(math.ceil(output_seq_length)) + self.encoder_seq_length = self.output_seq_length + + self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1 + + def prepare_config_and_inputs(self): + input_values = floats_tensor([self.batch_size, self.seq_length], scale=1.0) + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = self.get_config() + + return config, input_values, attention_mask + + def get_config(self): + 
return Data2VecAudioConfig( + hidden_size=self.hidden_size, + feat_extract_dropout=self.feat_extract_dropout, + feat_extract_activation=self.feat_extract_activation, + conv_dim=self.conv_dim, + conv_stride=self.conv_stride, + conv_kernel=self.conv_kernel, + conv_bias=self.conv_bias, + mask_time_prob=self.mask_time_prob, + mask_time_length=self.mask_time_length, + num_conv_pos_embeddings=self.num_conv_pos_embeddings, + num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_dropout_prob=self.hidden_dropout_prob, + intermediate_size=self.intermediate_size, + layer_norm_eps=self.layer_norm_eps, + hidden_act=self.hidden_act, + initializer_range=self.initializer_range, + vocab_size=self.vocab_size, + num_adapter_layers=self.num_adapter_layers, + adapter_stride=self.adapter_stride, + tdnn_dim=self.tdnn_dim, + tdnn_kernel=self.tdnn_kernel, + tdnn_dilation=self.tdnn_dilation, + xvector_output_dim=self.xvector_output_dim, + ) + + def create_and_check_model(self, config, input_values, attention_mask): + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) + ) + + def create_and_check_model_with_adapter(self, config, input_values, attention_mask): + config.add_adapter = True + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) + ) + + def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): + config.add_adapter = True + config.output_hidden_size = 8 + model = Data2VecAudioModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), + ) + + def check_ctc_loss(self, config, input_values, *args): + model = Data2VecAudioForCTC(config=config) + model.to(torch_device) + + # make sure that dropout is disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + model.config.ctc_loss_reduction = "sum" + sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + + model.config.ctc_loss_reduction = "mean" + mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + + self.parent.assertTrue(isinstance(sum_loss, float)) + self.parent.assertTrue(isinstance(mean_loss, float)) + + def check_seq_classifier_loss(self, config, input_values, *args): + model = Data2VecAudioForSequenceClassification(config=config) + model.to(torch_device) + + # make sure that dropout is disabled + model.eval() + + 
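+        # The block below truncates the batch to three examples, zero-pads them to three
+        # different effective lengths, and then compares the loss with and without the
+        # attention mask: if the mask is honored, the padded frames change the result,
+        # so the two losses must differ.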
input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + unmasked_loss = model(input_values, labels=labels).loss.item() + + self.parent.assertTrue(isinstance(masked_loss, float)) + self.parent.assertTrue(isinstance(unmasked_loss, float)) + self.parent.assertTrue(masked_loss != unmasked_loss) + + def check_ctc_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForCTC(config=config) + model.to(torch_device) + model.train() + + # freeze feature encoder + model.freeze_feature_encoder() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + if max_length_labels[i] < labels.shape[-1]: + # it's important that we make sure that target lengths are at least + # one shorter than logit lengths to prevent -inf + labels[i, max_length_labels[i] - 1 :] = -100 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_seq_classifier_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForSequenceClassification(config=config) + model.to(torch_device) + model.train() + + # freeze everything but the classification head + model.freeze_base_model() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_xvector_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = Data2VecAudioForXVector(config=config) + model.to(torch_device) + model.train() + + # freeze everything but the classification head + model.freeze_base_model() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_labels_out_of_vocab(self, config, input_values, *args): + model = Data2VecAudioForCTC(config) + model.to(torch_device) + model.train() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100) + + with 
self.parent.assertRaises(ValueError): + model(input_values, labels=labels) + + def prepare_config_and_inputs_for_common(self): + config, input_values, attention_mask = self.prepare_config_and_inputs() + inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_torch +class Data2VecAudioModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + Data2VecAudioForCTC, + Data2VecAudioModel, + Data2VecAudioForSequenceClassification, + Data2VecAudioForAudioFrameClassification, + Data2VecAudioForXVector, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "audio-classification": Data2VecAudioForSequenceClassification, + "automatic-speech-recognition": Data2VecAudioForCTC, + "feature-extraction": Data2VecAudioModel, + } + if is_torch_available() + else {} + ) + test_pruning = False + test_headmasking = False + + def setUp(self): + self.model_tester = Data2VecAudioModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_adapter(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) + + def test_model_with_adapter_proj_dim(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_seq_classifier_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_loss(*config_and_inputs) + + def test_ctc_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_training(*config_and_inputs) + + def test_seq_classifier_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_training(*config_and_inputs) + + def test_xvector_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_xvector_training(*config_and_inputs) + + def test_labels_out_of_vocab(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_labels_out_of_vocab(*config_and_inputs) + + @unittest.skip(reason="Data2VecAudio has no inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="`input_ids` is renamed to `input_values`") + def test_forward_signature(self): + pass + + @unittest.skip(reason="Data2VecAudio has no tokens embeddings") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Data2VecAudio has no inputs_embeds") + def test_model_get_set_embeddings(self): + pass + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + 
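+        # layerdrop is disabled below because randomly dropped encoder layers would make the
+        # retained gradients on hidden states / attentions non-deterministic for this check.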
model.to(torch_device) + + # set layer drop to 0 + model.config.layerdrop = 0.0 + + input_values = inputs_dict["input_values"] + + input_lengths = torch.tensor( + [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device + ) + output_lengths = model._get_feat_extract_output_lengths(input_lengths) + + labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) + inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) + inputs_dict["labels"] = labels + + outputs = model(**inputs_dict) + + output = outputs[0] + + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = [ + "conv.weight", + "masked_spec_embed", + "codevectors", + "quantizer.weight_proj.weight", + "project_hid.weight", + "project_hid.bias", + "project_q.weight", + "project_q.bias", + "feature_projection.projection.weight", + "feature_projection.projection.bias", + "objective.weight", + ] + if param.requires_grad: + if any(x in name for x in uniform_init_parms): + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.data.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + if hasattr(module, "codevectors") and module.codevectors is not None: + module.codevectors.data.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.data.fill_(3) + + def test_mask_feature_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + "hf-internal-testing/tiny-random-data2vec-seq-class", mask_feature_prob=0.2, mask_feature_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 1498, 32)) + + def test_mask_time_prob_ctc(self): + model = Data2VecAudioForCTC.from_pretrained( + 
"facebook/data2vec-audio-base-960h", mask_time_prob=0.2, mask_time_length=2 + ) + model.to(torch_device).train() + processor = Wav2Vec2Processor.from_pretrained( + "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True + ) + + batch_duration_in_seconds = [1, 3, 2, 6] + input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] + + batch = processor( + input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt" + ) + + logits = model( + input_values=batch["input_values"].to(torch_device), + attention_mask=batch["attention_mask"].to(torch_device), + ).logits + + self.assertEqual(logits.shape, (4, 299, 32)) + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + @slow + def test_model_from_pretrained(self): + model = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base") + self.assertIsNotNone(model) + + +@require_torch +class Data2VecAudioUtilsTest(unittest.TestCase): + def test_compute_mask_indices(self): + batch_size = 4 + sequence_length = 60 + mask_prob = 0.5 + mask_length = 1 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) + + def test_compute_mask_indices_low_prob(self): + # with these settings num_masked_spans=0.5, which means probabilistic rounding + # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in + # the other 5 out of 10, cases num_masked_spans=1 + n_trials = 100 + batch_size = 4 + sequence_length = 100 + mask_prob = 0.05 + mask_length = 10 + + count_dimensions_masked = 0 + count_dimensions_not_masked = 0 + + for _ in range(n_trials): + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + num_masks = torch.sum(mask).item() + + if num_masks > 0: + count_dimensions_masked += 1 + else: + count_dimensions_not_masked += 1 + + # as we test for at least 10 masked dimension and at least + # 10 non-masked dimension, this test could fail with probability: + # P(100 coin flips, at most 9 heads) = 1.66e-18 + self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) + self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) + + def test_compute_mask_indices_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + def test_compute_mask_indices_attn_mask_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + attention_mask[:2, sequence_length // 2 :] = 0 + + mask = _compute_mask_indices( + (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask + ) + mask = torch.from_numpy(mask).to(torch_device) + + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + self.assertTrue(mask[:2, sequence_length // 2 
:].sum() == 0) + + def test_compute_mask_indices_short_audio(self): + batch_size = 4 + sequence_length = 100 + mask_prob = 0.05 + mask_length = 10 + + attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + # force one example to be heavily padded + attention_mask[0, 5:] = 0 + + mask = _compute_mask_indices( + (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 + ) + + # make sure that non-padded examples cannot be padded + self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any()) + + +@require_torch +@require_soundfile +@slow +class Data2VecAudioModelIntegrationTest(unittest.TestCase): + def _load_datasamples(self, num_samples): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def _load_superb(self, task, num_samples): + ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True) + + return ds[:num_samples] + + def test_inference_ctc_normal(self): + model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h") + model.to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + input_speech = self._load_datasamples(1) + + input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device) + + with torch.no_grad(): + logits = model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_batched(self): + model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h").to(torch_device) + processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + + with torch.no_grad(): + logits = model(input_values).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around" + " him with thousands of spectators were trivialities not worth thinking about", + "his instant of panic was followed by a small sharp blow high on his chest", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) diff --git a/docs/transformers/tests/models/data2vec/test_modeling_data2vec_text.py b/docs/transformers/tests/models/data2vec/test_modeling_data2vec_text.py new file mode 100644 index 0000000000000000000000000000000000000000..acb18b3d8e89bb5f87bfda8aaa24a3fa926fca95 --- /dev/null +++ b/docs/transformers/tests/models/data2vec/test_modeling_data2vec_text.py @@ -0,0 +1,541 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Data2VecAudio model.""" + +import unittest + +from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask +from transformers import Data2VecTextConfig, is_torch_available +from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextModel, + ) + from transformers.models.data2vec.modeling_data2vec_text import ( + Data2VecTextForTextEmbeddings, + create_position_ids_from_input_ids, + ) + + +class Data2VecTextModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = 
ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return Data2VecTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = Data2VecTextModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = Data2VecTextForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + 
token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = Data2VecTextForCausalLM(config=config).to(torch_device).eval() + + # make sure that ids don't start with pad token + mask = input_ids.ne(config.pad_token_id).long() + input_ids = input_ids * mask + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + + # make sure that ids don't start with pad token + mask = next_tokens.ne(config.pad_token_id).long() + next_tokens = next_tokens * mask + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = Data2VecTextForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = Data2VecTextForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = 
token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = Data2VecTextForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class Data2VecTextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + Data2VecTextForCausalLM, + Data2VecTextForMaskedLM, + Data2VecTextModel, + Data2VecTextForSequenceClassification, + Data2VecTextForTokenClassification, + Data2VecTextForMultipleChoice, + Data2VecTextForQuestionAnswering, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": Data2VecTextModel, + "fill-mask": Data2VecTextForMaskedLM, + "question-answering": Data2VecTextForQuestionAnswering, + "text-classification": Data2VecTextForSequenceClassification, + "text-generation": Data2VecTextForCausalLM, + "token-classification": Data2VecTextForTokenClassification, + "zero-shot": Data2VecTextForSequenceClassification, + } + if is_torch_available() + else {} + ) + model_split_percents = [0.5, 0.9] + + def setUp(self): + self.model_tester = Data2VecTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=Data2VecTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, 
+ input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + config_and_inputs[0].position_embedding_type = "relative_key" + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/data2vec-text-base" + model = Data2VecTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_create_position_ids_respects_padding_index(self): + """This is a regression test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + model = Data2VecTextForTextEmbeddings(config=config) + + input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) + expected_positions = torch.as_tensor( + [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] + ) + + position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + def test_create_position_ids_from_inputs_embeds(self): + """This is a regression test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. 
Therefore, the + first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + embeddings = Data2VecTextForTextEmbeddings(config=config) + + inputs_embeds = torch.empty(2, 4, 30) + expected_single_positions = [ + 0 + embeddings.padding_idx + 1, + 1 + embeddings.padding_idx + 1, + 2 + embeddings.padding_idx + 1, + 3 + embeddings.padding_idx + 1, + ] + expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) + position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + +@require_torch +class Data2VecTextModelIntegrationTest(TestCasePlus): + @slow + def test_inference_masked_lm(self): + model = Data2VecTextForMaskedLM.from_pretrained("facebook/data2vec-text-base") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 50265)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = torch.tensor([[[0.2328, 0.0000, 1.1710], [2.2525, 0.0000, 1.9937], [2.1280, 0.0000, 1.8691]]]) + + torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + @slow + def test_inference_no_head(self): + model = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[0.1998, -0.0379, 0.0024], [-0.0971, -0.2214, -0.1798], [-0.0789, -0.2400, -0.1898]]] + ) + + torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/data2vec/test_modeling_data2vec_vision.py b/docs/transformers/tests/models/data2vec/test_modeling_data2vec_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..69c48b07088da67da1f4fa6b7346d0d7abcf91ed --- /dev/null +++ b/docs/transformers/tests/models/data2vec/test_modeling_data2vec_vision.py @@ -0,0 +1,376 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
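+# Note: Data2VecVision shares its ViT/BEiT-style backbone and preprocessing with BEiT,
+# which is why `BeitImageProcessor` (rather than a dedicated Data2VecVision image
+# processor class) is used throughout the tests below.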
+"""Testing suite for the PyTorch Data2VecVision model.""" + +import unittest + +from transformers import Data2VecVisionConfig +from transformers.testing_utils import ( + require_torch, + require_torch_multi_gpu, + require_vision, + slow, + torch_device, +) +from transformers.utils import ( + cached_property, + is_torch_available, + is_vision_available, +) + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + Data2VecVisionForImageClassification, + Data2VecVisionForSemanticSegmentation, + Data2VecVisionModel, + ) + from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES + + +if is_vision_available(): + from PIL import Image + + from transformers import BeitImageProcessor + + +class Data2VecVisionModelTester: + def __init__( + self, + parent, + vocab_size=100, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + num_labels=3, + scope=None, + out_indices=[0, 1, 2, 3], + attn_implementation="eager", + mask_ratio=0.5, + ): + self.parent = parent + self.vocab_size = 100 + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + self.out_indices = out_indices + self.num_labels = num_labels + + # in BeiT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + self.mask_length = self.seq_length - 1 + self.num_masks = int(mask_ratio * self.seq_length) + self.attn_implementation = attn_implementation + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + pixel_labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels, pixel_labels + + def get_config(self): + return Data2VecVisionConfig( + vocab_size=self.vocab_size, + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + 
initializer_range=self.initializer_range, + out_indices=self.out_indices, + attn_implementation=self.attn_implementation, + ) + + def create_and_check_model(self, config, pixel_values, labels, pixel_labels): + model = Data2VecVisionModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + num_patches = (self.image_size // self.patch_size) ** 2 + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels): + config.num_labels = self.type_sequence_label_size + model = Data2VecVisionForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def create_and_check_for_image_segmentation(self, config, pixel_values, labels, pixel_labels): + config.num_labels = self.num_labels + model = Data2VecVisionForSemanticSegmentation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_labels, self.image_size * 2, self.image_size * 2) + ) + result = model(pixel_values, labels=pixel_labels) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_labels, self.image_size * 2, self.image_size * 2) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels, pixel_labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class Data2VecVisionModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as Data2VecVision does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = ( + (Data2VecVisionModel, Data2VecVisionForImageClassification, Data2VecVisionForSemanticSegmentation) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "image-feature-extraction": Data2VecVisionModel, + "image-classification": Data2VecVisionForImageClassification, + "image-segmentation": Data2VecVisionForSemanticSegmentation, + } + if is_torch_available() + else {} + ) + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = Data2VecVisionModelTester(self) + self.config_tester = ConfigTester( + self, config_class=Data2VecVisionConfig, has_text_modality=False, hidden_size=37 + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="Data2VecVision does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @require_torch_multi_gpu + @unittest.skip( + reason="Data2VecVision has some layers using `add_module` which doesn't work well with `nn.DataParallel`" + ) + def test_multi_gpu_data_parallel_forward(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_image_segmentation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_segmentation(*config_and_inputs) + + def test_training(self): + if not self.model_tester.is_training: + self.skipTest(reason="model_tester.is_training is set to False") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + if model_class.__name__ in MODEL_MAPPING_NAMES.values(): + continue + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + if not self.model_tester.is_training: + self.skipTest(reason="model_tester.is_training is set to False") + + config.use_cache = False + config.return_dict = True + + for model_class in self.all_model_classes: + if model_class.__name__ in MODEL_MAPPING_NAMES.values() or not model_class.supports_gradient_checkpointing: + continue + # TODO: remove the following 3 lines once we have a MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING + # this can then be incorporated into _prepare_for_class in test_modeling_common.py + elif model_class.__name__ == "Data2VecVisionForSemanticSegmentation": + batch_size, num_channels, height, width = inputs_dict["pixel_values"].shape + inputs_dict["labels"] = torch.zeros( + [self.model_tester.batch_size, height, width], device=torch_device + ).long() + model = model_class(config) + model.gradient_checkpointing_enable() + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def 
test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + # we skip lambda parameters as these require special initial values + # determined by config.layer_scale_init_value + if "lambda" in name: + continue + if param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/data2vec-vision-base-ft1k" + model = Data2VecVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class Data2VecVisionModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ( + BeitImageProcessor.from_pretrained("facebook/data2vec-vision-base-ft1k") if is_vision_available() else None + ) + + @slow + def test_inference_image_classification_head_imagenet_1k(self): + model = Data2VecVisionForImageClassification.from_pretrained("facebook/data2vec-vision-base-ft1k").to( + torch_device + ) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + logits = outputs.logits + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(logits.shape, expected_shape) + + expected_slice = torch.tensor([0.3277, -0.1395, 0.0911]).to(torch_device) + + torch.testing.assert_close(logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4) + + expected_top2 = [model.config.label2id[i] for i in ["remote control, remote", "tabby, tabby cat"]] + self.assertEqual(logits[0].topk(2).indices.tolist(), expected_top2) + + @slow + def test_inference_interpolate_pos_encoding(self): + model_name = "facebook/data2vec-vision-base-ft1k" + model = Data2VecVisionModel.from_pretrained(model_name, **{"use_absolute_position_embeddings": True}).to( + torch_device + ) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + processor = BeitImageProcessor.from_pretrained("facebook/data2vec-vision-base-ft1k") + inputs = processor(images=image, return_tensors="pt", size={"height": 480, "width": 480}) + pixel_values = inputs.pixel_values.to(torch_device) + + # with interpolate_pos_encoding being True the model should process the higher resolution image + # successfully and produce the expected output. 
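+        # With absolute position embeddings enabled, the embedding table only covers the
+        # model's default patch grid; interpolate_pos_encoding=True resizes it on the fly so
+        # the larger 480 x 480 input can be processed without a shape mismatch.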
+ with torch.no_grad(): + outputs = model(pixel_values, interpolate_pos_encoding=True) + + # num_cls_tokens + (height / patch_size) * (width / patch_size) + # 1 + (480 / 16) * (480 / 16) = 901 + expected_shape = torch.Size((1, 901, 768)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) diff --git a/docs/transformers/tests/models/data2vec/test_modeling_tf_data2vec_vision.py b/docs/transformers/tests/models/data2vec/test_modeling_tf_data2vec_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..3f88801534027c6f7385c6e3793990f5f5b61b91 --- /dev/null +++ b/docs/transformers/tests/models/data2vec/test_modeling_tf_data2vec_vision.py @@ -0,0 +1,491 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the TensorFlow Data2VecVision model.""" + +from __future__ import annotations + +import collections.abc +import inspect +import unittest + +import numpy as np + +from transformers import Data2VecVisionConfig +from transformers.file_utils import cached_property, is_tf_available, is_vision_available +from transformers.testing_utils import require_tf, require_vision, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TFData2VecVisionForImageClassification, + TFData2VecVisionForSemanticSegmentation, + TFData2VecVisionModel, + ) + from transformers.modeling_tf_utils import keras + +if is_vision_available(): + from PIL import Image + + from transformers import BeitImageProcessor + + +class TFData2VecVisionModelTester: + def __init__( + self, + parent, + vocab_size=100, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + num_labels=3, + scope=None, + out_indices=[0, 1, 2, 3], + ): + self.parent = parent + self.vocab_size = 100 + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + self.out_indices = out_indices + self.num_labels = num_labels + + def prepare_config_and_inputs(self): + 
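+        # A small batch of random pixel values; classification labels and per-pixel
+        # segmentation labels are only created when use_labels is True.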
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + pixel_labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels, pixel_labels + + def get_config(self): + return Data2VecVisionConfig( + vocab_size=self.vocab_size, + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + out_indices=self.out_indices, + ) + + def create_and_check_model(self, config, pixel_values, labels, pixel_labels): + model = TFData2VecVisionModel(config=config) + result = model(pixel_values, training=False) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = ( + self.image_size + if isinstance(self.image_size, collections.abc.Iterable) + else (self.image_size, self.image_size) + ) + patch_size = ( + self.patch_size + if isinstance(self.image_size, collections.abc.Iterable) + else (self.patch_size, self.patch_size) + ) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels): + config.num_labels = self.type_sequence_label_size + model = TFData2VecVisionForImageClassification(config) + + result = model(pixel_values, labels=labels, training=False) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def create_and_check_for_image_segmentation(self, config, pixel_values, labels, pixel_labels): + config.num_labels = self.num_labels + model = TFData2VecVisionForSemanticSegmentation(config) + result = model(pixel_values, training=False) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_labels, self.image_size * 2, self.image_size * 2) + ) + result = model(pixel_values, labels=pixel_labels) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_labels, self.image_size * 2, self.image_size * 2) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels, pixel_labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + def prepare_config_and_inputs_for_keras_fit(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, _, _ = config_and_inputs + inputs_dict = {"pixel_values": pixel_values, "labels": tf.zeros(self.batch_size)} + return config, inputs_dict + + +@require_tf +class TFData2VecVisionModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as Data2VecVision does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = ( + (TFData2VecVisionModel, TFData2VecVisionForImageClassification, TFData2VecVisionForSemanticSegmentation) + if is_tf_available() + else () + ) + pipeline_model_mapping = ( + {"feature-extraction": TFData2VecVisionModel, "image-classification": TFData2VecVisionForImageClassification} + if is_tf_available() + else {} + ) + + test_pruning = False + test_onnx = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = TFData2VecVisionModelTester(self) + self.config_tester = ConfigTester( + self, config_class=Data2VecVisionConfig, has_text_modality=False, hidden_size=37 + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="Data2VecVision does not use inputs_embeds") + def test_inputs_embeds(self): + # Data2VecVision does not use inputs_embeds + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, keras.layers.Layer)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_image_segmentation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_segmentation(*config_and_inputs) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + # in Data2VecVision, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) + image_size = ( + self.model_tester.image_size + if isinstance(self.model_tester.image_size, collections.abc.Iterable) + else (self.model_tester.image_size, self.model_tester.image_size) + ) + patch_size = ( + self.model_tester.patch_size + if isinstance(self.model_tester.patch_size, collections.abc.Iterable) + else (self.model_tester.patch_size, self.model_tester.patch_size) + ) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_len = num_patches + 1 + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else 
outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) + + self.assertEqual(out_len + 1, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # Data2VecVision has a different seq_length + image_size = ( + self.model_tester.image_size + if isinstance(self.model_tester.image_size, collections.abc.Iterable) + else (self.model_tester.image_size, self.model_tester.image_size) + ) + patch_size = ( + self.model_tester.patch_size + if isinstance(self.model_tester.patch_size, collections.abc.Iterable) + else (self.model_tester.patch_size, self.model_tester.patch_size) + ) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_length = num_patches + 1 + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + # Overriding this method since the base method won't be compatible with Data2VecVision. + @slow + def test_keras_fit(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + # Since `TFData2VecVisionModel` cannot operate with the default `fit()` method. 
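+            # The bare model exposes no loss head, so only the classification and
+            # segmentation variants are compiled and fitted below.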
+ if model_class.__name__ != "TFData2VecVisionModel": + model = model_class(config) + if getattr(model, "hf_compute_loss", None): + # Test that model correctly compute the loss with kwargs + _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit() + + label_names = {"labels"} + self.assertGreater(len(label_names), 0, msg="No matching label names found!") + labels = {key: val for key, val in prepared_for_class.items() if key in label_names} + inputs_minus_labels = { + key: val for key, val in prepared_for_class.items() if key not in label_names + } + self.assertGreater(len(inputs_minus_labels), 0) + model.compile(optimizer=keras.optimizers.SGD(0.0), run_eagerly=True) + + # Make sure the model fits without crashing regardless of where we pass the labels + history1 = model.fit( + prepared_for_class, + validation_data=prepared_for_class, + steps_per_epoch=1, + validation_steps=1, + shuffle=False, + ) + val_loss1 = history1.history["val_loss"][0] + history2 = model.fit( + inputs_minus_labels, + labels, + validation_data=(inputs_minus_labels, labels), + steps_per_epoch=1, + validation_steps=1, + shuffle=False, + ) + val_loss2 = history2.history["val_loss"][0] + self.assertTrue(np.allclose(val_loss1, val_loss2, atol=1e-2, rtol=1e-3)) + + # Overriding this method since the base method won't be compatible with Data2VecVision. + def test_loss_computation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + # Since `TFData2VecVisionModel` won't have labels against which we + # could compute loss. + if model_class.__name__ != "TFData2VecVisionModel": + model = model_class(config) + if getattr(model, "hf_compute_loss", None): + # The number of elements in the loss should be the same as the number of elements in the label + _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit() + added_label = prepared_for_class[ + sorted(prepared_for_class.keys() - inputs_dict.keys(), reverse=True)[0] + ] + loss_size = tf.size(added_label) + + # Test that model correctly compute the loss with kwargs + possible_input_names = {"input_ids", "pixel_values", "input_features"} + input_name = possible_input_names.intersection(set(prepared_for_class)).pop() + model_input = prepared_for_class.pop(input_name) + + loss = model(model_input, **prepared_for_class)[0] + self.assertEqual(loss.shape, [loss_size]) + + # Test that model correctly compute the loss with a dict + _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit() + loss = model(**prepared_for_class)[0] + self.assertEqual(loss.shape, [loss_size]) + + # Test that model correctly compute the loss with a tuple + label_keys = prepared_for_class.keys() - inputs_dict.keys() + signature = inspect.signature(model.call).parameters + signature_names = list(signature.keys()) + + # Create a dictionary holding the location of the tensors in the tuple + tuple_index_mapping = {0: input_name} + for label_key in label_keys: + label_key_index = signature_names.index(label_key) + tuple_index_mapping[label_key_index] = label_key + sorted_tuple_index_mapping = sorted(tuple_index_mapping.items()) + # Initialize a list with their default values, update the values and convert to a tuple + list_input = [] + + for name in signature_names: + if name != "kwargs": + list_input.append(signature[name].default) + + for index, value in sorted_tuple_index_mapping: + list_input[index] = prepared_for_class[value] + + tuple_input = 
tuple(list_input) + + # Send to model + loss = model(tuple_input[:-1])[0] + + self.assertEqual(loss.shape, [loss_size]) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/data2vec-vision-base-ft1k" + model = TFData2VecVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_tf +@require_vision +class TFData2VecVisionModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ( + BeitImageProcessor.from_pretrained("facebook/data2vec-vision-base-ft1k") if is_vision_available() else None + ) + + @slow + def test_inference_image_classification_head_imagenet_1k(self): + model = TFData2VecVisionForImageClassification.from_pretrained("facebook/data2vec-vision-base-ft1k") + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="tf") + + # forward pass + outputs = model(**inputs) + logits = outputs.logits + + # verify the logits + expected_shape = tf.convert_to_tensor([1, 1000]) + self.assertEqual(logits.shape, expected_shape) + + expected_slice = tf.convert_to_tensor([0.3277, -0.1395, 0.0911]) + + tf.debugging.assert_near(logits[0, :3], expected_slice, atol=1e-4) + + expected_top2 = [model.config.label2id[i] for i in ["remote control, remote", "tabby, tabby cat"]] + self.assertEqual(tf.nn.top_k(outputs.logits[0], 2).indices.numpy().tolist(), expected_top2) diff --git a/docs/transformers/tests/models/dbrx/__init__.py b/docs/transformers/tests/models/dbrx/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/dbrx/test_modeling_dbrx.py b/docs/transformers/tests/models/dbrx/test_modeling_dbrx.py new file mode 100644 index 0000000000000000000000000000000000000000..512bd6a02c08e5df71ac0403db8ca6e3185b9e6d --- /dev/null +++ b/docs/transformers/tests/models/dbrx/test_modeling_dbrx.py @@ -0,0 +1,279 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch DBRX model.""" + +import unittest + +from transformers import DbrxConfig, is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import DbrxForCausalLM, DbrxModel + + +class DbrxModelTester: + def __init__( + self, + parent, + hidden_size=32, + ffn_hidden_size=32, + num_attention_heads=4, + kv_n_heads=4, + num_hidden_layers=5, + max_position_embeddings=512, + type_vocab_size=16, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + use_cache=True, + type_sequence_label_size=2, + num_labels=3, + num_choices=4, + scope=None, + clip_qkv=8, + rope_theta=500000, + attn_config_model_type="", + emb_pdrop=0.0, + moe_jitter_eps=0, + moe_loss_weight=0.05, + moe_num_experts=16, + moe_top_k=4, + ffn_config_model_type="", + ffn_act_fn_name="gelu", + initializer_range=0.02, + output_router_logits=False, + resid_pdrop=0.0, + tie_word_embeddings=False, + torch_dtype="bfloat16", + vocab_size=99, + is_decoder=True, + pad_token_id=0, + ): + # Parameters unique to testing + self.batch_size = batch_size + self.seq_length = seq_length + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.parent = parent + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + + # attn_config params + self.clip_qkv = clip_qkv + self.kv_n_heads = kv_n_heads + self.rope_theta = rope_theta + self.attn_config_model_type = attn_config_model_type + + # ffn_config params + self.ffn_hidden_size = ffn_hidden_size + self.moe_jitter_eps = moe_jitter_eps + self.moe_loss_weight = moe_loss_weight + self.moe_num_experts = moe_num_experts + self.moe_top_k = moe_top_k + self.ffn_config_model_type = ffn_config_model_type + self.ffn_act_fn_name = ffn_act_fn_name + + # Other model params + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.vocab_size = vocab_size + self.use_cache = use_cache + self.initializer_range = initializer_range + self.emb_pdrop = emb_pdrop + self.output_router_logits = output_router_logits + self.resid_pdrop = resid_pdrop + self.tie_word_embeddings = tie_word_embeddings + self.torch_dtype = torch_dtype + self.is_decoder = is_decoder + self.pad_token_id = pad_token_id + + # Make the dictionaries + self.ffn_config = { + "ffn_hidden_size": self.ffn_hidden_size, + "moe_jitter_eps": self.moe_jitter_eps, + "moe_loss_weight": self.moe_loss_weight, + "moe_num_experts": self.moe_num_experts, + "moe_top_k": self.moe_top_k, + "model_type": self.ffn_config_model_type, + "ffn_act_fn": {"name": self.ffn_act_fn_name}, + } + self.attn_config = { + "clip_qkv": self.clip_qkv, + "kv_n_heads": self.kv_n_heads, + "model_type": self.attn_config_model_type, + "rope_theta": self.rope_theta, + } + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + 
input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + # Behind the scenes, `DbrxConfig` maps the parameters `hidden_size`, `num_hidden_layers`, + # `num_attention_heads`, `max_position_embeddings` to the parameters `d_model`, `n_layers`, + # `n_heads`, `max_seq_len` respectively. We use the first group of parameters because + # other tests expect every model to have these parameters with these specific names. + config = DbrxConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, # mapped to `d_model` + num_hidden_layers=self.num_hidden_layers, # mapped to `n_layers` + num_attention_heads=self.num_attention_heads, # mapped to `n_heads` + max_position_embeddings=self.max_position_embeddings, # mapped to `max_seq_len` + attn_config=self.attn_config, + ffn_config=self.ffn_config, + resid_pdrop=self.resid_pdrop, + emb_pdrop=self.emb_pdrop, + use_cache=self.use_cache, + initializer_range=self.initializer_range, + output_router_logits=self.output_router_logits, + is_decoder=self.is_decoder, + pad_token_id=self.pad_token_id, + ) + return config + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Dbrx + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DbrxModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common with Llama->Dbrx + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class DbrxModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (DbrxModel, DbrxForCausalLM) if is_torch_available() else () + pipeline_model_mapping = {"text-generation": DbrxForCausalLM} if is_torch_available() else {} + test_headmasking = False + test_pruning = False + + def setUp(self): + self.model_tester = DbrxModelTester(self) + self.config_tester = ConfigTester(self, config_class=DbrxConfig, d_model=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "eitanturok/dbrx-tiny" + model = DbrxModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @unittest.skip(reason="Dbrx models have weight tying disabled.") + def test_tied_weights_keys(self): + pass + + # Offload does not work with Dbrx models because of the forward of DbrxExperts where we chunk the experts. + # The issue is that the offloaded weights of the mlp layer are still on meta device (w1_chunked, v1_chunked, w2_chunked) + @unittest.skip(reason="Dbrx models do not work with offload") + def test_cpu_offload(self): + pass + + @unittest.skip(reason="Dbrx models do not work with offload") + def test_disk_offload_safetensors(self): + pass + + @unittest.skip(reason="Dbrx models do not work with offload") + def test_disk_offload_bin(self): + pass + + +@require_torch +class DbrxModelIntegrationTest(unittest.TestCase): + @slow + def test_tiny_model_logits(self): + model = DbrxForCausalLM.from_pretrained("Rocketknight1/dbrx-tiny-random") + input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + vocab_size = model.vocab_size + + expected_shape = torch.Size((1, 6, vocab_size)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor( + [ + [ + [-1.6300e-04, 5.0118e-04, 2.5437e-04], + [2.0422e-05, 2.7210e-04, -1.5125e-04], + [-1.5105e-04, 4.6879e-04, 3.3309e-04], + ] + ] + ) + torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/deberta/__init__.py b/docs/transformers/tests/models/deberta/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/deberta/test_modeling_deberta.py b/docs/transformers/tests/models/deberta/test_modeling_deberta.py new file mode 100644 index 0000000000000000000000000000000000000000..8ec4eefc4f3f44e508ed2d27a3e254a246c8a57f --- /dev/null +++ b/docs/transformers/tests/models/deberta/test_modeling_deberta.py @@ -0,0 +1,308 @@ +# Copyright 2018 Microsoft Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
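+"""Testing suite for the PyTorch DeBERTa model."""
+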
+import unittest + +from transformers import DebertaConfig, is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + DebertaForMaskedLM, + DebertaForQuestionAnswering, + DebertaForSequenceClassification, + DebertaForTokenClassification, + DebertaModel, + ) + + +class DebertaModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + relative_attention=False, + position_biased_input=True, + pos_att_type="None", + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.relative_attention = relative_attention + self.position_biased_input = position_biased_input + self.pos_att_type = pos_att_type + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return DebertaConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + 
initializer_range=self.initializer_range, + relative_attention=self.relative_attention, + position_biased_input=self.position_biased_input, + pos_att_type=self.pos_att_type, + ) + + def get_pipeline_config(self): + config = self.get_config() + config.vocab_size = 300 + return config + + def check_loss_output(self, result): + self.parent.assertListEqual(list(result.loss.size()), []) + + def create_and_check_deberta_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaModel(config=config) + model.to(torch_device) + model.eval() + sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0] + sequence_output = model(input_ids, token_type_ids=token_type_ids)[0] + sequence_output = model(input_ids)[0] + + self.parent.assertListEqual(list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size]) + + def create_and_check_deberta_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_deberta_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DebertaForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels]) + self.check_loss_output(result) + + def create_and_check_deberta_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DebertaForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_deberta_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class DebertaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + DebertaModel, + 
DebertaForMaskedLM, + DebertaForSequenceClassification, + DebertaForTokenClassification, + DebertaForQuestionAnswering, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": DebertaModel, + "fill-mask": DebertaForMaskedLM, + "question-answering": DebertaForQuestionAnswering, + "text-classification": DebertaForSequenceClassification, + "token-classification": DebertaForTokenClassification, + "zero-shot": DebertaForSequenceClassification, + } + if is_torch_available() + else {} + ) + + fx_compatible = True + test_torchscript = False + test_pruning = False + test_head_masking = False + is_encoder_decoder = False + + def setUp(self): + self.model_tester = DebertaModelTester(self) + self.config_tester = ConfigTester(self, config_class=DebertaConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_deberta_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_model(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_sequence_classification(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_question_answering(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "microsoft/deberta-base" + model = DebertaModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @unittest.skip("This test was broken by the refactor in #22105, TODO @ArthurZucker") + def test_torch_fx_output_loss(self): + pass + + @unittest.skip("This test was broken by the refactor in #22105, TODO @ArthurZucker") + def test_torch_fx(self): + pass + + +@require_torch +@require_sentencepiece +@require_tokenizers +class DebertaModelIntegrationTest(unittest.TestCase): + @unittest.skip(reason="Model not available yet") + def test_inference_masked_lm(self): + pass + + @slow + def test_inference_no_head(self): + model = DebertaModel.from_pretrained("microsoft/deberta-base") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + with torch.no_grad(): + output = model(input_ids, attention_mask=attention_mask)[0] + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[-0.5986, -0.8055, -0.8462], [1.4484, -0.9348, -0.8059], [0.3123, 0.0032, -1.4131]]] + ) + torch.testing.assert_close(output[:, 1:4, 1:4], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/deberta/test_modeling_tf_deberta.py b/docs/transformers/tests/models/deberta/test_modeling_tf_deberta.py new file mode 100644 index 0000000000000000000000000000000000000000..ea1e716dd66c346e12006c051501f00fc2f84062 --- /dev/null +++ b/docs/transformers/tests/models/deberta/test_modeling_tf_deberta.py @@ -0,0 +1,295 @@ +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import annotations + +import unittest + +from transformers import DebertaConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TFDebertaForMaskedLM, + TFDebertaForQuestionAnswering, + TFDebertaForSequenceClassification, + TFDebertaForTokenClassification, + TFDebertaModel, + ) + + +class TFDebertaModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 2 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.relative_attention = False + self.max_relative_positions = -1 + self.position_biased_input = True + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + + config = DebertaConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + 
relative_attention=self.relative_attention, + max_relative_positions=self.max_relative_positions, + position_biased_input=self.position_biased_input, + initializer_range=self.initializer_range, + return_dict=True, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDebertaModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDebertaForMaskedLM(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFDebertaForSequenceClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFDebertaForTokenClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDebertaForQuestionAnswering(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFDebertaModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + TFDebertaModel, + TFDebertaForMaskedLM, + TFDebertaForQuestionAnswering, + TFDebertaForSequenceClassification, + TFDebertaForTokenClassification, + ) + if is_tf_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": TFDebertaModel, + "fill-mask": TFDebertaForMaskedLM, + "question-answering": 
TFDebertaForQuestionAnswering, + "text-classification": TFDebertaForSequenceClassification, + "token-classification": TFDebertaForTokenClassification, + "zero-shot": TFDebertaForSequenceClassification, + } + if is_tf_available() + else {} + ) + + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFDebertaModelTester(self) + self.config_tester = ConfigTester(self, config_class=DebertaConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model = TFDebertaModel.from_pretrained("kamalkraj/deberta-base") + self.assertIsNotNone(model) + + +@require_tf +class TFDeBERTaModelIntegrationTest(unittest.TestCase): + @unittest.skip(reason="Model not available yet") + def test_inference_masked_lm(self): + pass + + @slow + def test_inference_no_head(self): + model = TFDebertaModel.from_pretrained("kamalkraj/deberta-base") + input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + attention_mask = tf.constant([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask)[0] + + expected_slice = tf.constant( + [ + [ + [-0.59855896, -0.80552566, -0.8462135], + [1.4484025, -0.93483794, -0.80593085], + [0.3122741, 0.00316059, -1.4131377], + ] + ] + ) + tf.debugging.assert_near(output[:, 1:4, 1:4], expected_slice, atol=1e-4) diff --git a/docs/transformers/tests/models/deberta/test_tokenization_deberta.py b/docs/transformers/tests/models/deberta/test_tokenization_deberta.py new file mode 100644 index 0000000000000000000000000000000000000000..ad625eabf0e9967375aff67419ee07dab00407c8 --- /dev/null +++ b/docs/transformers/tests/models/deberta/test_tokenization_deberta.py @@ -0,0 +1,173 @@ +# Copyright 2019 Hugging Face inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
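+"""Testing suite for the DeBERTa (slow and fast) tokenizers."""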
+ + +import json +import os +import unittest +from functools import lru_cache + +from transformers import DebertaTokenizer, DebertaTokenizerFast +from transformers.models.deberta.tokenization_deberta import VOCAB_FILES_NAMES +from transformers.testing_utils import slow + +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible + + +class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/deberta-base" + tokenizer_class = DebertaTokenizer + test_rust_tokenizer = True + rust_tokenizer_class = DebertaTokenizerFast + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "[UNK]", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + cls.special_tokens_map = {"unk_token": "[UNK]"} + + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(cls.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "lower newer" + output_text = "lower newer" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.get_tokenizer() + text = "lower newer" + bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def test_token_type_ids(self): + tokenizer = self.get_tokenizer() + tokd = tokenizer("Hello", "World") + expected_token_type_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] + self.assertListEqual(tokd["token_type_ids"], expected_token_type_ids) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("microsoft/deberta-base") + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_text_from_decode = tokenizer.encode( + "sequence builders", add_special_tokens=True, add_prefix_space=False + ) + encoded_pair_from_decode = tokenizer.encode( + "sequence builders", "multi-sequence build", add_special_tokens=True, add_prefix_space=False + ) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == encoded_text_from_decode + assert encoded_pair == encoded_pair_from_decode + + @slow + def test_tokenizer_integration(self): + tokenizer_classes = [self.tokenizer_class] + if self.test_rust_tokenizer: + 
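+            # Run the fast (Rust) tokenizer through the same integration checks as the slow one.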
tokenizer_classes.append(self.rust_tokenizer_class) + + for tokenizer_class in tokenizer_classes: + tokenizer = tokenizer_class.from_pretrained("microsoft/deberta-base") + + sequences = [ + "ALBERT: A Lite BERT for Self-supervised Learning of Language Representations", + "ALBERT incorporates two parameter reduction techniques", + "The first one is a factorized embedding parameterization. By decomposing the large vocabulary" + " embedding matrix into two small matrices, we separate the size of the hidden layers from the size of" + " vocabulary embedding.", + ] + + encoding = tokenizer(sequences, padding=True) + decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]] + + # fmt: off + expected_encoding = { + 'input_ids': [ + [1, 2118, 11126, 565, 35, 83, 25191, 163, 18854, 13, 12156, 12, 16101, 25376, 13807, 9, 22205, 27893, 1635, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 2118, 11126, 565, 24536, 80, 43797, 4878, 7373, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 133, 78, 65, 16, 10, 3724, 1538, 33183, 11303, 43797, 1938, 4, 870, 24165, 29105, 5, 739, 32644, 33183, 11303, 36173, 88, 80, 650, 7821, 45940, 6, 52, 2559, 5, 1836, 9, 5, 7397, 13171, 31, 5, 1836, 9, 32644, 33183, 11303, 4, 2] + ], + 'token_type_ids': [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ], + 'attention_mask': [ + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + ] + } + # fmt: on + + expected_decoded_sequence = [ + "ALBERT: A Lite BERT for Self-supervised Learning of Language Representations", + "ALBERT incorporates two parameter reduction techniques", + "The first one is a factorized embedding parameterization. By decomposing the large vocabulary" + " embedding matrix into two small matrices, we separate the size of the hidden layers from the size of" + " vocabulary embedding.", + ] + + self.assertDictEqual(encoding.data, expected_encoding) + + for expected, decoded in zip(expected_decoded_sequence, decoded_sequences): + self.assertEqual(expected, decoded) diff --git a/docs/transformers/tests/models/deberta_v2/__init__.py b/docs/transformers/tests/models/deberta_v2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/deberta_v2/test_modeling_deberta_v2.py b/docs/transformers/tests/models/deberta_v2/test_modeling_deberta_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..6de08d3f4bd7a21a288171ed9814110b92775f29 --- /dev/null +++ b/docs/transformers/tests/models/deberta_v2/test_modeling_deberta_v2.py @@ -0,0 +1,326 @@ +# Copyright 2018 Microsoft Authors and the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +from transformers import DebertaV2Config, is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + DebertaV2ForMaskedLM, + DebertaV2ForMultipleChoice, + DebertaV2ForQuestionAnswering, + DebertaV2ForSequenceClassification, + DebertaV2ForTokenClassification, + DebertaV2Model, + ) + + +class DebertaV2ModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + relative_attention=False, + position_biased_input=True, + pos_att_type="None", + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.relative_attention = relative_attention + self.position_biased_input = position_biased_input + self.pos_att_type = pos_att_type + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = 
self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return DebertaV2Config( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + relative_attention=self.relative_attention, + position_biased_input=self.position_biased_input, + pos_att_type=self.pos_att_type, + ) + + def check_loss_output(self, result): + self.parent.assertListEqual(list(result.loss.size()), []) + + def create_and_check_deberta_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaV2Model(config=config) + model.to(torch_device) + model.eval() + sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0] + sequence_output = model(input_ids, token_type_ids=token_type_ids)[0] + sequence_output = model(input_ids)[0] + + self.parent.assertListEqual(list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size]) + + def create_and_check_deberta_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaV2ForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_deberta_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DebertaV2ForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels]) + self.check_loss_output(result) + + def create_and_check_deberta_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DebertaV2ForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_deberta_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaV2ForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def 
create_and_check_deberta_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaV2ForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class DebertaV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + DebertaV2Model, + DebertaV2ForMaskedLM, + DebertaV2ForSequenceClassification, + DebertaV2ForTokenClassification, + DebertaV2ForQuestionAnswering, + DebertaV2ForMultipleChoice, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": DebertaV2Model, + "fill-mask": DebertaV2ForMaskedLM, + "question-answering": DebertaV2ForQuestionAnswering, + "text-classification": DebertaV2ForSequenceClassification, + "token-classification": DebertaV2ForTokenClassification, + "zero-shot": DebertaV2ForSequenceClassification, + } + if is_torch_available() + else {} + ) + + fx_compatible = True + test_torchscript = False + test_pruning = False + test_head_masking = False + is_encoder_decoder = False + + def setUp(self): + self.model_tester = DebertaV2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=DebertaV2Config, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_deberta_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_model(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_sequence_classification(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_question_answering(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_multiple_choice(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "microsoft/deberta-v2-xlarge" + model = 
DebertaV2Model.from_pretrained(model_name) + self.assertIsNotNone(model) + + @unittest.skip("This test was broken by the refactor in #22105, TODO @ArthurZucker") + def test_torch_fx_output_loss(self): + pass + + @unittest.skip("This test was broken by the refactor in #22105, TODO @ArthurZucker") + def test_torch_fx(self): + pass + + +@require_torch +@require_sentencepiece +@require_tokenizers +class DebertaV2ModelIntegrationTest(unittest.TestCase): + @unittest.skip(reason="Model not available yet") + def test_inference_masked_lm(self): + pass + + @slow + def test_inference_no_head(self): + model = DebertaV2Model.from_pretrained("microsoft/deberta-v2-xlarge") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + with torch.no_grad(): + output = model(input_ids, attention_mask=attention_mask)[0] + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[0.2356, 0.1948, 0.0369], [-0.1063, 0.3586, -0.5152], [-0.6399, -0.0259, -0.2525]]] + ) + torch.testing.assert_close(output[:, 1:4, 1:4], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/deberta_v2/test_modeling_tf_deberta_v2.py b/docs/transformers/tests/models/deberta_v2/test_modeling_tf_deberta_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..b69e2eb489880ad7823151f01e52d8389affe26e --- /dev/null +++ b/docs/transformers/tests/models/deberta_v2/test_modeling_tf_deberta_v2.py @@ -0,0 +1,309 @@ +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from __future__ import annotations + +import unittest + +from transformers import DebertaV2Config, is_tf_available +from transformers.testing_utils import require_tf, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TFDebertaV2ForMaskedLM, + TFDebertaV2ForMultipleChoice, + TFDebertaV2ForQuestionAnswering, + TFDebertaV2ForSequenceClassification, + TFDebertaV2ForTokenClassification, + TFDebertaV2Model, + ) + + +class TFDebertaV2ModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + relative_attention=False, + position_biased_input=True, + pos_att_type="None", + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.relative_attention = relative_attention + self.position_biased_input = position_biased_input + self.pos_att_type = pos_att_type + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + + config = DebertaV2Config( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + relative_attention=self.relative_attention, + position_biased_input=self.position_biased_input, + initializer_range=self.initializer_range, + 
return_dict=True, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDebertaV2Model(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDebertaV2ForMaskedLM(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFDebertaV2ForSequenceClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFDebertaV2ForTokenClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDebertaV2ForQuestionAnswering(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFDebertaV2ForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + 
choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFDebertaModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + TFDebertaV2Model, + TFDebertaV2ForMaskedLM, + TFDebertaV2ForQuestionAnswering, + TFDebertaV2ForMultipleChoice, + TFDebertaV2ForSequenceClassification, + TFDebertaV2ForTokenClassification, + ) + if is_tf_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": TFDebertaV2Model, + "fill-mask": TFDebertaV2ForMaskedLM, + "question-answering": TFDebertaV2ForQuestionAnswering, + "text-classification": TFDebertaV2ForSequenceClassification, + "token-classification": TFDebertaV2ForTokenClassification, + "zero-shot": TFDebertaV2ForSequenceClassification, + } + if is_tf_available() + else {} + ) + + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFDebertaV2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=DebertaV2Config, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model = TFDebertaV2Model.from_pretrained("kamalkraj/deberta-v2-xlarge") + self.assertIsNotNone(model) + + +@require_tf +class TFDeBERTaV2ModelIntegrationTest(unittest.TestCase): + @unittest.skip(reason="Model not available yet") + def test_inference_masked_lm(self): + pass + + @slow + def test_inference_no_head(self): + model = TFDebertaV2Model.from_pretrained("kamalkraj/deberta-v2-xlarge") + input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + attention_mask = tf.constant([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask)[0] + + expected_slice = tf.constant( + [[[0.2356, 0.1948, 0.0369], [-0.1063, 0.3586, -0.5152], [-0.6399, -0.0259, -0.2525]]] + ) + tf.debugging.assert_near(output[:, 1:4, 1:4], expected_slice, atol=1e-4) diff --git a/docs/transformers/tests/models/deberta_v2/test_tokenization_deberta_v2.py b/docs/transformers/tests/models/deberta_v2/test_tokenization_deberta_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..e629f279249c4083f8d6c3e1279a0810fc63a3bc --- /dev/null +++ b/docs/transformers/tests/models/deberta_v2/test_tokenization_deberta_v2.py @@ -0,0 +1,262 @@ +# Copyright 2019 Hugging Face inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import DebertaV2Tokenizer, DebertaV2TokenizerFast +from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow + +from ...test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model") + + +@require_sentencepiece +@require_tokenizers +class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/deberta-v2-xlarge" + tokenizer_class = DebertaV2Tokenizer + rust_tokenizer_class = DebertaV2TokenizerFast + test_sentencepiece = True + test_sentencepiece_ignore_case = True + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # We have a SentencePiece fixture for testing + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>") + tokenizer.save_pretrained(cls.tmpdirname) + + def get_input_output_texts(self, tokenizer): + input_text = "this is a test" + output_text = "this is a test" + return input_text, output_text + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "<pad>" + token_id = 0 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + self.assertEqual(vocab_keys[0], "<pad>") + self.assertEqual(vocab_keys[1], "<unk>") + self.assertEqual(vocab_keys[-1], "[PAD]") + self.assertEqual(len(vocab_keys), 30_001) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 30_000) + + def test_do_lower_case(self): + # fmt: off + sequence = " \tHeLLo!how \n Are yoU? " + tokens_target = ["▁hello", "!", "how", "▁are", "▁you", "?"] + # fmt: on + + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>", do_lower_case=True) + tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(tokens, tokens_target) + + rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, unk_token="<unk>", do_lower_case=True) + rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(rust_tokens, tokens_target) + + @unittest.skip(reason="There is an inconsistency between slow and fast tokenizer due to a bug in the fast one.") + def test_sentencepiece_tokenize_and_convert_tokens_to_string(self): + pass + + @unittest.skip(reason="There is an inconsistency between slow and fast tokenizer due to a bug in the fast one.") + def test_sentencepiece_tokenize_and_decode(self): + pass + + def test_split_by_punct(self): + # fmt: off + sequence = "I was born in 92000, and this is falsé!" 
+ tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", "!", ] + # fmt: on + + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", split_by_punct=True) + tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(tokens, tokens_target) + + rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, unk_token="", split_by_punct=True) + rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(rust_tokens, tokens_target) + + def test_do_lower_case_split_by_punct(self): + # fmt: off + sequence = "I was born in 92000, and this is falsé!" + tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", "!", ] + # fmt: on + + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=True) + tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) + self.assertListEqual(tokens, tokens_target) + + rust_tokenizer = DebertaV2TokenizerFast( + SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=True + ) + rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) + self.assertListEqual(rust_tokens, tokens_target) + + def test_do_lower_case_split_by_punct_false(self): + # fmt: off + sequence = "I was born in 92000, and this is falsé!" + tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "!", ] + # fmt: on + + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=False) + tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(tokens, tokens_target) + + rust_tokenizer = DebertaV2TokenizerFast( + SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=False + ) + rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(rust_tokens, tokens_target) + + def test_do_lower_case_false_split_by_punct(self): + # fmt: off + sequence = "I was born in 92000, and this is falsé!" + tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", "!", ] + # fmt: on + + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=True) + tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(tokens, tokens_target) + + rust_tokenizer = DebertaV2TokenizerFast( + SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=True + ) + rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(rust_tokens, tokens_target) + + def test_do_lower_case_false_split_by_punct_false(self): + # fmt: off + sequence = " \tHeLLo!how \n Are yoU? 
" + tokens_target = ["▁", "", "e", "", "o", "!", "how", "▁", "", "re", "▁yo", "", "?"] + # fmt: on + + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=False) + tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(tokens, tokens_target) + + rust_tokenizer = DebertaV2TokenizerFast( + SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=False + ) + rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) + + self.assertListEqual(rust_tokens, tokens_target) + + def test_rust_and_python_full_tokenizers(self): + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "I was born in 92000, and this is falsé!" + + tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) + rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + def test_full_tokenizer(self): + sequence = "This is a test" + ids_target = [13, 1, 4398, 25, 21, 1289] + tokens_target = ["▁", "T", "his", "▁is", "▁a", "▁test"] + back_tokens_target = ["▁", "", "his", "▁is", "▁a", "▁test"] + + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", keep_accents=True) + rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, unk_token="", keep_accents=True) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, ids_target) + tokens = tokenizer.tokenize(sequence) + self.assertListEqual(tokens, tokens_target) + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual(back_tokens, back_tokens_target) + + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(rust_ids, ids_target) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(rust_tokens, tokens_target) + rust_back_tokens = rust_tokenizer.convert_ids_to_tokens(rust_ids) + self.assertListEqual(rust_back_tokens, back_tokens_target) + + # fmt: off + sequence = "I was born in 92000, and this is falsé!" 
+ ids_target = [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 187] + tokens_target = ["▁", "I", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "!", ] + back_tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "!", ] + # fmt: on + + ids = tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, ids_target) + tokens = tokenizer.tokenize(sequence) + self.assertListEqual(tokens, tokens_target) + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual(back_tokens, back_tokens_target) + + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(rust_ids, ids_target) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(rust_tokens, tokens_target) + rust_back_tokens = rust_tokenizer.convert_ids_to_tokens(rust_ids) + self.assertListEqual(rust_back_tokens, back_tokens_target) + + def test_sequence_builders(self): + tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB) + + text = tokenizer.encode("sequence builders") + text_2 = tokenizer.encode("multi-sequence build") + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + self.assertEqual([tokenizer.cls_token_id] + text + [tokenizer.sep_token_id], encoded_sentence) + self.assertEqual( + [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [tokenizer.sep_token_id], + encoded_pair, + ) + + @slow + def test_tokenizer_integration(self): + expected_encoding = {'input_ids': [[1, 39867, 36, 19390, 486, 27, 35052, 81436, 18, 60685, 1225, 7, 35052, 81436, 18, 9367, 16899, 18, 15937, 53, 594, 773, 18, 16287, 30465, 36, 15937, 6, 41139, 38, 36979, 60763, 191, 6, 34132, 99, 6, 50538, 390, 43230, 6, 34132, 2779, 20850, 14, 699, 1072, 1194, 36, 382, 10901, 53, 7, 699, 1072, 2084, 36, 20422, 630, 53, 19, 105, 3049, 1896, 1053, 16899, 1506, 11, 37978, 4243, 7, 1237, 31869, 200, 16566, 654, 6, 35052, 81436, 7, 55630, 13593, 4, 2], [1, 26, 15011, 13, 667, 8, 1053, 18, 23611, 1237, 72356, 12820, 34, 104134, 1209, 35, 13313, 6627, 21, 202, 347, 7, 164, 2399, 11, 46, 4485, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 5, 1232, 2864, 15785, 14951, 105, 5, 8581, 1250, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="microsoft/deberta-v2-xlarge", + revision="ad6e42c1532ddf3a15c39246b63f5559d558b670", + ) diff --git a/docs/transformers/tests/models/decision_transformer/__init__.py b/docs/transformers/tests/models/decision_transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/decision_transformer/test_modeling_decision_transformer.py b/docs/transformers/tests/models/decision_transformer/test_modeling_decision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..b6bdeaa0837dbe1a73a097d8f92e83266bd064f4 --- /dev/null +++ b/docs/transformers/tests/models/decision_transformer/test_modeling_decision_transformer.py @@ -0,0 +1,245 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch DecisionTransformer model.""" + +import inspect +import unittest + +from transformers import DecisionTransformerConfig, is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import DecisionTransformerModel + + +class DecisionTransformerModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + act_dim=6, + state_dim=17, + hidden_size=23, + is_training=True, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.act_dim = act_dim + self.state_dim = state_dim + self.hidden_size = hidden_size + self.is_training = is_training + + def prepare_config_and_inputs(self): + states = floats_tensor((self.batch_size, self.seq_length, self.state_dim)) + actions = floats_tensor((self.batch_size, self.seq_length, self.act_dim)) + rewards = floats_tensor((self.batch_size, self.seq_length, 1)) + returns_to_go = floats_tensor((self.batch_size, self.seq_length, 1)) + timesteps = ids_tensor((self.batch_size, self.seq_length), vocab_size=1000) + attention_mask = random_attention_mask((self.batch_size, self.seq_length)) + + config = self.get_config() + + return ( + config, + states, + actions, + rewards, + returns_to_go, + timesteps, + attention_mask, + ) + + def get_config(self): + return DecisionTransformerConfig( + batch_size=self.batch_size, + seq_length=self.seq_length, + act_dim=self.act_dim, + state_dim=self.state_dim, + hidden_size=self.hidden_size, + ) + + def create_and_check_model( + self, + config, + states, + actions, + rewards, + returns_to_go, + timesteps, + attention_mask, + ): + model = DecisionTransformerModel(config=config) + model.to(torch_device) + model.eval() + result = model(states, actions, rewards, returns_to_go, timesteps, attention_mask) + + self.parent.assertEqual(result.state_preds.shape, states.shape) + self.parent.assertEqual(result.action_preds.shape, actions.shape) + self.parent.assertEqual(result.return_preds.shape, returns_to_go.shape) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.seq_length * 3, self.hidden_size) + ) # seq length *3 as there are 3 modelities: states, returns and actions + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + states, + actions, + rewards, + returns_to_go, + timesteps, + attention_mask, + ) = config_and_inputs + inputs_dict = { + "states": states, + "actions": actions, + "rewards": rewards, + "returns_to_go": returns_to_go, + "timesteps": timesteps, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class DecisionTransformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (DecisionTransformerModel,) if is_torch_available() else () + pipeline_model_mapping = {"feature-extraction": DecisionTransformerModel} if is_torch_available() else {} + + # Ignoring of a failing test from GenerationTesterMixin, as the model does not use inputs_ids + test_generate_without_input_ids = False + + # Ignoring of a failing tests from ModelTesterMixin, as the model does not implement these features + test_pruning = False + test_resize_embeddings = False + 
test_head_masking = False + test_attention_outputs = False + test_hidden_states_output = False + test_inputs_embeds = False + test_gradient_checkpointing = False + test_torchscript = False + + def setUp(self): + self.model_tester = DecisionTransformerModelTester(self) + self.config_tester = ConfigTester(self, config_class=DecisionTransformerConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "edbeeching/decision-transformer-gym-hopper-medium" + model = DecisionTransformerModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = [ + "states", + "actions", + "rewards", + "returns_to_go", + "timesteps", + "attention_mask", + ] + + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + @unittest.skip(reason="Model does not have input embeddings") + def test_model_get_set_embeddings(self): + pass + + +@require_torch +class DecisionTransformerModelIntegrationTest(unittest.TestCase): + @slow + def test_autoregressive_prediction(self): + """ + An integration test that performs autoregressive prediction of state, action and return + from a sequence of state, actions and returns. Test is performed over two timesteps. 
+ + """ + + NUM_STEPS = 2 # number of steps of autoregressive prediction we will perform + TARGET_RETURN = 10 # defined by the RL environment, may be normalized + model = DecisionTransformerModel.from_pretrained("edbeeching/decision-transformer-gym-hopper-expert") + model = model.to(torch_device) + config = model.config + torch.manual_seed(0) + state = torch.randn(1, 1, config.state_dim).to(device=torch_device, dtype=torch.float32) # env.reset() + + expected_outputs = torch.tensor( + [[0.242793, -0.28693074, 0.8742613], [0.67815274, -0.08101085, -0.12952147]], device=torch_device + ) + + returns_to_go = torch.tensor(TARGET_RETURN, device=torch_device, dtype=torch.float32).reshape(1, 1, 1) + states = state + actions = torch.zeros(1, 0, config.act_dim, device=torch_device, dtype=torch.float32) + rewards = torch.zeros(1, 0, device=torch_device, dtype=torch.float32) + timesteps = torch.tensor(0, device=torch_device, dtype=torch.long).reshape(1, 1) + + for step in range(NUM_STEPS): + actions = torch.cat([actions, torch.zeros(1, 1, config.act_dim, device=torch_device)], dim=1) + rewards = torch.cat([rewards, torch.zeros(1, 1, device=torch_device)], dim=1) + + attention_mask = torch.ones(1, states.shape[1]).to(dtype=torch.long, device=states.device) + + with torch.no_grad(): + _, action_pred, _ = model( + states=states, + actions=actions, + rewards=rewards, + returns_to_go=returns_to_go, + timesteps=timesteps, + attention_mask=attention_mask, + return_dict=False, + ) + + self.assertEqual(action_pred.shape, actions.shape) + torch.testing.assert_close(action_pred[0, -1], expected_outputs[step], rtol=1e-4, atol=1e-4) + state, reward, _, _ = ( # env.step(action) + torch.randn(1, 1, config.state_dim).to(device=torch_device, dtype=torch.float32), + 1.0, + False, + {}, + ) + + actions[-1] = action_pred[0, -1] + states = torch.cat([states, state], dim=1) + pred_return = returns_to_go[0, -1] - reward + returns_to_go = torch.cat([returns_to_go, pred_return.reshape(1, 1, 1)], dim=1) + timesteps = torch.cat( + [timesteps, torch.ones((1, 1), device=torch_device, dtype=torch.long) * (step + 1)], dim=1 + ) diff --git a/docs/transformers/tests/models/deepseek_v3/__init__.py b/docs/transformers/tests/models/deepseek_v3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/deepseek_v3/test_modeling_deepseek_v3.py b/docs/transformers/tests/models/deepseek_v3/test_modeling_deepseek_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..1c2690b54edf0033b14b998d1584beb35e9af711 --- /dev/null +++ b/docs/transformers/tests/models/deepseek_v3/test_modeling_deepseek_v3.py @@ -0,0 +1,565 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch DeepseekV3 model.""" + +import unittest + +from packaging import version +from parameterized import parameterized + +from transformers import AutoTokenizer, DeepseekV3Config, is_torch_available, set_seed +from transformers.testing_utils import ( + cleanup, + require_read_token, + require_torch, + require_torch_accelerator, + require_torch_sdpa, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + DeepseekV3ForCausalLM, + DeepseekV3Model, + ) + from transformers.models.deepseek_v3.modeling_deepseek_v3 import ( + DeepseekV3RotaryEmbedding, + ) + + +class DeepseekV3ModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + intermediate_size=37, + moe_intermediate_size=12, + num_hidden_layers=5, + num_attention_heads=4, + num_key_value_heads=4, + n_shared_experts=1, + n_routed_experts=8, + routed_scaling_factor=2.5, + kv_lora_rank=16, + q_lora_rank=32, + qk_rope_head_dim=16, + v_head_dim=32, + qk_nope_head_dim=32, + n_group=2, + topk_group=1, + num_experts_per_tok=8, + first_k_dense_replace=2, + norm_topk_prob=True, + aux_loss_alpha=0.001, + hidden_act="silu", + max_position_embeddings=512, + initializer_range=0.02, + attention_probs_dropout_prob=0.1, + type_vocab_size=16, + type_sequence_label_size=2, + num_labels=3, + num_choices=4, + pad_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.moe_intermediate_size = moe_intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.n_shared_experts = n_shared_experts + self.n_routed_experts = n_routed_experts + self.routed_scaling_factor = routed_scaling_factor + self.kv_lora_rank = kv_lora_rank + self.q_lora_rank = q_lora_rank + self.qk_rope_head_dim = qk_rope_head_dim + self.v_head_dim = v_head_dim + self.qk_nope_head_dim = qk_nope_head_dim + self.n_group = n_group + self.topk_group = topk_group + self.num_experts_per_tok = num_experts_per_tok + self.first_k_dense_replace = first_k_dense_replace + self.norm_topk_prob = norm_topk_prob + self.aux_loss_alpha = aux_loss_alpha + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) + + token_type_ids = None + if 
self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return DeepseekV3Config( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + intermediate_size=self.intermediate_size, + moe_intermediate_size=self.moe_intermediate_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + num_key_value_heads=self.num_key_value_heads, + n_shared_experts=self.n_shared_experts, + n_routed_experts=self.n_routed_experts, + routed_scaling_factor=self.routed_scaling_factor, + kv_lora_rank=self.kv_lora_rank, + q_lora_rank=self.q_lora_rank, + qk_rope_head_dim=self.qk_rope_head_dim, + v_head_dim=self.v_head_dim, + qk_nope_head_dim=self.qk_nope_head_dim, + n_group=self.n_group, + topk_group=self.topk_group, + num_experts_per_tok=self.num_experts_per_tok, + first_k_dense_replace=self.first_k_dense_replace, + norm_topk_prob=self.norm_topk_prob, + aux_loss_alpha=self.aux_loss_alpha, + hidden_act=self.hidden_act, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + use_cache=True, + pad_token_id=self.pad_token_id, + attention_dropout=self.attention_probs_dropout_prob, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DeepseekV3Model(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class DeepseekV3ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + DeepseekV3Model, + DeepseekV3ForCausalLM, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (DeepseekV3ForCausalLM,) if is_torch_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": DeepseekV3Model, + "text-generation": DeepseekV3ForCausalLM, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + fx_compatible = False + + # Need to use `0.8` instead of `0.9` for `test_cpu_offload` + # This is because we are hitting edge cases with the causal_mask buffer + model_split_percents = [0.5, 0.7, 0.8] + + # used in `test_torch_compile_for_training` + _torch_compile_train_cls = DeepseekV3ForCausalLM if is_torch_available() else None + + def setUp(self): + self.model_tester = DeepseekV3ModelTester(self) + self.config_tester = ConfigTester(self, config_class=DeepseekV3Config, hidden_size=37) + + @unittest.skip("Failing because of unique 
cache (HybridCache)") + def test_model_outputs_equivalence(self, **kwargs): + pass + + @parameterized.expand([("random",), ("same",)]) + @unittest.skip("DeepseekV3 has HybridCache which is not compatible with assisted decoding") + def test_assisted_decoding_matches_greedy_search(self, assistant_type): + pass + + @unittest.skip("DeepseekV3 has HybridCache which is not compatible with assisted decoding") + def test_prompt_lookup_decoding_matches_greedy_search(self, assistant_type): + pass + + @unittest.skip("DeepseekV3 has HybridCache which is not compatible with assisted decoding") + def test_assisted_decoding_sample(self): + pass + + @unittest.skip("DeepseekV3 has HybridCache which is not compatible with dola decoding") + def test_dola_decoding_sample(self): + pass + + @unittest.skip("DeepseekV3 has HybridCache and doesn't support continue from past kv") + def test_generate_continue_from_past_key_values(self): + pass + + @unittest.skip("DeepseekV3 has HybridCache and doesn't support low_memory generation") + def test_beam_search_low_memory(self): + pass + + @unittest.skip("DeepseekV3 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate(self): + pass + + @unittest.skip("DeepseekV3 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip("DeepseekV3 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate_low_memory(self): + pass + + @unittest.skip( + "DeepseekV3 has HybridCache and doesn't support StaticCache. Though it could, it shouldn't support." + ) + def test_generate_with_static_cache(self): + pass + + @unittest.skip( + "DeepseekV3 has HybridCache and doesn't support StaticCache. Though it could, it shouldn't support." + ) + def test_generate_from_inputs_embeds_with_static_cache(self): + pass + + @unittest.skip( + "DeepseekV3 has HybridCache and doesn't support StaticCache. Though it could, it shouldn't support." 
+ ) + def test_generate_continue_from_inputs_embeds(self): + pass + + @unittest.skip("DeepseekV3's eager attn/sdpa attn outputs are expected to be different") + def test_sdpa_equivalence(self): + pass + + @unittest.skip("Deepseek-V3 uses MLA so it is not compatible with the standard cache format") + def test_beam_search_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip("Deepseek-V3 uses MLA so it is not compatible with the standard cache format") + def test_generate_compilation_all_outputs(self): + pass + + @unittest.skip("Deepseek-V3 uses MLA so it is not compatible with the standard cache format") + def test_generate_compile_model_forward(self): + pass + + @unittest.skip("Deepseek-V3 uses MLA so it is not compatible with the standard cache format") + def test_greedy_generate_dict_outputs_use_cache(self): + pass + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + @parameterized.expand([("yarn",)]) + def test_model_rope_scaling_from_config(self, scaling_type): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + short_input = ids_tensor([1, 10], config.vocab_size) + long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + original_model = DeepseekV3Model(config) + original_model.to(torch_device) + original_model.eval() + original_short_output = original_model(short_input).last_hidden_state + original_long_output = original_model(long_input).last_hidden_state + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + config.rope_scaling = {"type": scaling_type, "factor": 10.0} + scaled_model = DeepseekV3Model(config) + scaled_model.to(torch_device) + scaled_model.eval() + scaled_short_output = scaled_model(short_input).last_hidden_state + scaled_long_output = scaled_model(long_input).last_hidden_state + + # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original + # maximum sequence length, so the outputs for the short input should match. 
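# Note (illustrative, not part of the diff): this copy of the test is parameterized with ("yarn",) only, so the
# `scaling_type == "dynamic"` branch below never runs here; for YaRN the short-input outputs are expected to
# differ from the unscaled model, which is what the `else` branch asserts.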
+ if scaling_type == "dynamic": + torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) + else: + self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) + + # The output should be different for long inputs + self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) + + def test_model_rope_scaling(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + scaling_factor = 10 + short_input_length = 10 + long_input_length = int(config.max_position_embeddings * 1.5) + + # Inputs + x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exclusively to get the dtype and the device + position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) + position_ids_short = position_ids_short.unsqueeze(0) + position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) + position_ids_long = position_ids_long.unsqueeze(0) + + # Sanity check original RoPE + original_rope = DeepseekV3RotaryEmbedding(config=config).to(torch_device) + original_cos_short, original_sin_short = original_rope(x, position_ids_short) + original_cos_long, original_sin_long = original_rope(x, position_ids_long) + torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) + + # Sanity check linear RoPE scaling + # New position "x" should match original position with index "x/scaling_factor" + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + linear_scaling_rope = DeepseekV3RotaryEmbedding(config=config).to(torch_device) + linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) + linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) + torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) + for new_position in range(0, long_input_length, scaling_factor): + original_position = int(new_position // scaling_factor) + torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) + torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) + + # Sanity check Dynamic NTK RoPE scaling + # Scaling should only be observed after a long input is fed.
We can observe that the frequencies increase + # with scaling_factor (or that `inv_freq` decreases) + config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} + ntk_scaling_rope = DeepseekV3RotaryEmbedding(config=config).to(torch_device) + ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) + ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) + torch.testing.assert_close(ntk_cos_short, original_cos_short) + torch.testing.assert_close(ntk_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_sin_long, original_sin_long) + self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) + + # Sanity check Yarn RoPE scaling + # Scaling should be over the entire input + config.rope_scaling = {"type": "yarn", "factor": scaling_factor} + yarn_scaling_rope = DeepseekV3RotaryEmbedding(config=config).to(torch_device) + yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) + yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) + torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_short, original_cos_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_long, original_sin_long) + + def test_past_key_values_format(self): + """ + Overwriting to pass the expected cache shapes (Deepseek-V3 uses MLA so the cache shapes are non-standard) + """ + config, inputs = self.model_tester.prepare_config_and_inputs_for_common() + batch_size, seq_length = inputs["input_ids"].shape + # difference: last dim + k_embed_dim = config.qk_nope_head_dim + config.qk_rope_head_dim + v_embed_dim = config.v_head_dim + self_attention_key_cache_shape = (batch_size, config.num_key_value_heads, seq_length, k_embed_dim) + self_attention_value_cache_shape = (batch_size, config.num_key_value_heads, seq_length, v_embed_dim) + # build the full cache shapes + num_hidden_layers = config.num_hidden_layers + all_cache_shapes = [ + [self_attention_key_cache_shape, self_attention_value_cache_shape] for _ in range(num_hidden_layers) + ] + super().test_past_key_values_format(custom_all_cache_shapes=all_cache_shapes) + + @require_torch_sdpa + @slow + def test_eager_matches_sdpa_generate(self): + """ + Overwriting the common test as the test is flaky on tiny models + """ + max_new_tokens = 30 + + tokenizer = AutoTokenizer.from_pretrained("bzantium/tiny-deepseek-v3") + + model_sdpa = DeepseekV3ForCausalLM.from_pretrained( + "bzantium/tiny-deepseek-v3", + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + ).to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = DeepseekV3ForCausalLM.from_pretrained( + "bzantium/tiny-deepseek-v3", + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + attn_implementation="eager", + ).to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + texts = [ + "hi here's a longer context, getting longer and", + "Hello this is a very long sentence my
friend, very long for real", + "Today I am in Paris and", + ] + + for padding_side in ["left", "right"]: + tokenizer.padding_side = padding_side + tokenizer.pad_token = tokenizer.eos_token + + inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) + + res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) + res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) + + with self.subTest(f"{padding_side}"): + torch.testing.assert_close( + res_eager, + res_sdpa, + msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", + ) + + +@require_torch_accelerator +class DeepseekV3IntegrationTest(unittest.TestCase): + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + def tearDown(self): + # See LlamaIntegrationTest.tearDown(). Can be removed once LlamaIntegrationTest.tearDown() is removed. + cleanup(torch_device, gc_collect=False) + + @slow + @require_torch_accelerator + @require_read_token + def test_compile_static_cache(self): + # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2 + # work as intended. See https://github.com/pytorch/pytorch/issues/121943 + if version.parse(torch.__version__) < version.parse("2.3.0"): + self.skipTest(reason="This test requires torch >= 2.3 to run.") + + NUM_TOKENS_TO_GENERATE = 40 + # Note on `EXPECTED_TEXT_COMPLETION`'s diff: the current value matches the original test if the original test + # was changed to have a cache of 53 tokens (as opposed to 4096), on Ampere GPUs. + EXPECTED_TEXT_COMPLETION = [ + "Simply put, the theory of relativity states that 1) the speed of light is constant in all inertial " + "reference frames, and 2) the laws of physics are the same for all inertial reference frames.\nThe " + "theory of relativ", + "My favorite all time favorite condiment is ketchup. I love it on everything. 
I love it on my eggs, " + "my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p", + ] + + prompts = [ + "Simply put, the theory of relativity states that ", + "My favorite all time favorite condiment is ketchup.", + ] + tokenizer = AutoTokenizer.from_pretrained("bzantium/tiny-deepseek-v3", pad_token="", padding_side="right") + model = DeepseekV3ForCausalLM.from_pretrained( + "bzantium/tiny-deepseek-v3", device_map=torch_device, torch_dtype=torch.float16 + ) + inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + + # Dynamic Cache + generated_ids = model.generate(**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False) + dynamic_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, dynamic_text) + + # Static Cache + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" + ) + static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, static_text) + + # Static Cache + compile + model._cache = None # clear cache object, initialized when we pass `cache_implementation="static"` + model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" + ) + static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, static_compiled_text) diff --git a/docs/transformers/tests/models/deformable_detr/__init__.py b/docs/transformers/tests/models/deformable_detr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/docs/transformers/tests/models/deformable_detr/test_image_processing_deformable_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..f0fb0db022ba00cf994c24191f5c78d38b418ac1 --- /dev/null +++ b/docs/transformers/tests/models/deformable_detr/test_image_processing_deformable_detr.py @@ -0,0 +1,734 @@ +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
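Illustrative aside (not part of the diff): the image-processing tests below rely on the aspect-preserving resize rule that DETR-style image processors apply when `size` is given as a shortest-edge dict. A minimal standalone sketch of that arithmetic, assuming truncating integer conversion as in the tester's `get_expected_values` and ignoring the `longest_edge` cap, which these tests deliberately set too high to trigger:

def expected_hw(height: int, width: int, shortest_edge: int = 800) -> tuple[int, int]:
    # Illustrative helper, not a transformers API: the short side is resized to `shortest_edge`,
    # the long side is scaled by the same factor and truncated.
    if width < height:
        return int(shortest_edge * height / width), shortest_edge
    if width > height:
        return shortest_edge, int(shortest_edge * width / height)
    return shortest_edge, shortest_edge

# The 480x640 COCO fixture used throughout these tests therefore comes out at 800x1066:
# 640 * 800 / 480 = 1066.67, truncated to 1066.
assert expected_hw(480, 640) == (800, 1066)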
+ + +import json +import pathlib +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available + +from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import DeformableDetrImageProcessor, DeformableDetrImageProcessorFast + + +class DeformableDetrImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + do_rescale=True, + rescale_factor=1 / 255, + do_pad=True, + ): + # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p + size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_pad = do_pad + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_rescale": self.do_rescale, + "rescale_factor": self.rescale_factor, + "do_pad": self.do_pad, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to DeformableDetrImageProcessor, + assuming do_resize is set to True with a scalar size. 
+ """ + if not batched: + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + elif isinstance(image, np.ndarray): + h, w = image.shape[0], image.shape[1] + else: + h, w = image.shape[1], image.shape[2] + if w < h: + expected_height = int(self.size["shortest_edge"] * h / w) + expected_width = self.size["shortest_edge"] + elif w > h: + expected_height = self.size["shortest_edge"] + expected_width = int(self.size["shortest_edge"] * w / h) + else: + expected_height = self.size["shortest_edge"] + expected_width = self.size["shortest_edge"] + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + def expected_output_image_shape(self, images): + height, width = self.get_expected_values(images, batched=True) + return self.num_channels, height, width + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = DeformableDetrImageProcessor if is_vision_available() else None + fast_image_processing_class = DeformableDetrImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = DeformableDetrImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "do_pad")) + self.assertTrue(hasattr(image_processing, "size")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) + self.assertEqual(image_processor.do_pad, True) + + image_processor = image_processing_class.from_dict( + self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False + ) + self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) + self.assertEqual(image_processor.do_pad, False) + + @slow + def test_call_pytorch_with_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": 
target} + + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class() + encoding = image_processing(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + torch.testing.assert_close(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + torch.testing.assert_close(encoding["labels"][0]["size"], expected_size) + + @slow + def test_call_pytorch_with_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class(format="coco_panoptic") + encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + torch.testing.assert_close(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + 
expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify masks + expected_masks_sum = 822873 + relative_error = torch.abs(encoding["labels"][0]["masks"].sum() - expected_masks_sum) / expected_masks_sum + self.assertTrue(relative_error < 1e-3) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + torch.testing.assert_close(encoding["labels"][0]["size"], expected_size) + + @slow + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->DeformableDetr + def test_batched_coco_detection_annotations(self): + image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800)) + + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f: + target = json.loads(f.read()) + + annotations_0 = {"image_id": 39769, "annotations": target} + annotations_1 = {"image_id": 39769, "annotations": target} + + # Adjust the bounding boxes for the resized image + w_0, h_0 = image_0.size + w_1, h_1 = image_1.size + for i in range(len(annotations_1["annotations"])): + coords = annotations_1["annotations"][i]["bbox"] + new_bbox = [ + coords[0] * w_1 / w_0, + coords[1] * h_1 / h_0, + coords[2] * w_1 / w_0, + coords[3] * h_1 / h_0, + ] + annotations_1["annotations"][i]["bbox"] = new_bbox + + images = [image_0, image_1] + annotations = [annotations_0, annotations_1] + + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class() + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + return_tensors="pt", # do_convert_annotations=True + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.6879, 0.4609, 0.0755, 0.3691], + [0.2118, 0.3359, 0.2601, 0.1566], + [0.5011, 0.5000, 0.9979, 1.0000], + [0.5010, 0.5020, 0.9979, 0.9959], + [0.3284, 0.5944, 0.5884, 0.8112], + [0.8394, 0.5445, 0.3213, 0.9110], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.4130, 0.2765, 0.0453, 0.2215], + [0.1272, 0.2016, 0.1561, 0.0940], + [0.3757, 0.4933, 0.7488, 0.9865], + [0.3759, 0.5002, 0.7492, 0.9955], + [0.1971, 0.5456, 0.3532, 0.8646], + [0.5790, 0.4115, 0.3430, 0.7161], + ] + ) + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1e-3, rtol=1e-3) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, 
torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1, rtol=1) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1, rtol=1) + + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->DeformableDetr + def test_batched_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800)) + + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt") as f: + target = json.loads(f.read()) + + annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + w_0, h_0 = image_0.size + w_1, h_1 = image_1.size + for i in range(len(annotation_1["segments_info"])): + coords = annotation_1["segments_info"][i]["bbox"] + new_bbox = [ + coords[0] * w_1 / w_0, + coords[1] * h_1 / h_0, + coords[2] * w_1 / w_0, + coords[3] * h_1 / h_0, + ] + annotation_1["segments_info"][i]["bbox"] = new_bbox + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + images = [image_0, image_1] + annotations = [annotation_0, annotation_1] + + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class(format="coco_panoptic") + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_tensors="pt", + return_segmentation_masks=True, + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + 
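# Note (illustrative, not part of the diff): 800 x 1066 follows from the shortest-edge resize. The 480x640
# fixture's short side is scaled to 800, so its long side becomes int(640 * 800 / 480) = 1066; the copy that was
# resized to 800x800 stays 800x800, and the batch is then padded to the common 800x1066 size checked here.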
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.2625, 0.5437, 0.4688, 0.8625], + [0.7719, 0.4104, 0.4531, 0.7125], + [0.5000, 0.4927, 0.9969, 0.9854], + [0.1688, 0.2000, 0.2063, 0.0917], + [0.5492, 0.2760, 0.0578, 0.2187], + [0.4992, 0.4990, 0.9984, 0.9979], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.1576, 0.3262, 0.2814, 0.5175], + [0.4634, 0.2463, 0.2720, 0.4275], + [0.3002, 0.2956, 0.5985, 0.5913], + [0.1013, 0.1200, 0.1238, 0.0550], + [0.3297, 0.1656, 0.0347, 0.1312], + [0.2997, 0.2994, 0.5994, 0.5987], + ] + ) + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1e-3, rtol=1e-3) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1, rtol=1) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1, rtol=1) + + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->DeformableDetr + def test_max_width_max_height_resizing_and_pad_strategy(self): + for image_processing_class in self.image_processor_list: + image_1 = torch.ones([200, 100, 3], 
dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = image_processing_class( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) + + def test_longest_edge_shortest_edge_resizing_strategy(self): + for image_processing_class in self.image_processor_list: + image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) + + # max size is set; width < height; + # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436 + image_processor = image_processing_class( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436])) + + image_2 = torch.ones([653, 958, 3], dtype=torch.uint8) + # max size is set; height < width; + # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640 + image_processor = image_processing_class( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640])) + + image_3 = torch.ones([100, 120, 3], dtype=torch.uint8) + # max size is set; width == size; height > max_size; + # do_pad=False, longest_edge=118, shortest_edge=100, image=120x100 -> 118x98 + image_processor = image_processing_class( + size={"longest_edge": 118, "shortest_edge": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_3], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118])) + + image_4 = torch.ones([128, 50, 3], dtype=torch.uint8) + # max size is set; height == size; width < max_size; + # do_pad=False, longest_edge=256, shortest_edge=50, image=50x128 -> 50x128 + image_processor = image_processing_class( + size={"longest_edge": 256, "shortest_edge": 50}, + 
do_pad=False, + ) + inputs = image_processor(images=[image_4], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50])) + + image_5 = torch.ones([50, 50, 3], dtype=torch.uint8) + # max size is set; height == width; width < max_size; + # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50 + image_processor = image_processing_class( + size={"longest_edge": 117, "shortest_edge": 50}, + do_pad=False, + ) + inputs = image_processor(images=[image_5], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50])) + + @slow + @require_torch_gpu + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations + def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + # Ignore copy + processor = self.image_processor_list[1]() + + # 1. run processor on CPU + encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu") + # 2. run processor on GPU + encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device="cuda") + + # verify pixel values + self.assertEqual(encoding_cpu["pixel_values"].shape, encoding_gpu["pixel_values"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["pixel_values"][0, 0, 0, :3], + encoding_gpu["pixel_values"][0, 0, 0, :3].to("cpu"), + atol=1e-4, + ) + ) + # verify area + torch.testing.assert_close(encoding_cpu["labels"][0]["area"], encoding_gpu["labels"][0]["area"].to("cpu")) + # verify boxes + self.assertEqual(encoding_cpu["labels"][0]["boxes"].shape, encoding_gpu["labels"][0]["boxes"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["boxes"][0], encoding_gpu["labels"][0]["boxes"][0].to("cpu"), atol=1e-3 + ) + ) + # verify image_id + torch.testing.assert_close( + encoding_cpu["labels"][0]["image_id"], encoding_gpu["labels"][0]["image_id"].to("cpu") + ) + # verify is_crowd + torch.testing.assert_close( + encoding_cpu["labels"][0]["iscrowd"], encoding_gpu["labels"][0]["iscrowd"].to("cpu") + ) + # verify class_labels + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["class_labels"], encoding_gpu["labels"][0]["class_labels"].to("cpu") + ) + ) + # verify orig_size + torch.testing.assert_close( + encoding_cpu["labels"][0]["orig_size"], encoding_gpu["labels"][0]["orig_size"].to("cpu") + ) + # verify size + torch.testing.assert_close(encoding_cpu["labels"][0]["size"], encoding_gpu["labels"][0]["size"].to("cpu")) + + @slow + @require_torch_gpu + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations + def test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + # Ignore copy + processor = 
self.image_processor_list[1](format="coco_panoptic") + + # 1. run processor on CPU + encoding_cpu = processor( + images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cpu" + ) + # 2. run processor on GPU + encoding_gpu = processor( + images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cuda" + ) + + # verify pixel values + self.assertEqual(encoding_cpu["pixel_values"].shape, encoding_gpu["pixel_values"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["pixel_values"][0, 0, 0, :3], + encoding_gpu["pixel_values"][0, 0, 0, :3].to("cpu"), + atol=1e-4, + ) + ) + # verify area + torch.testing.assert_close(encoding_cpu["labels"][0]["area"], encoding_gpu["labels"][0]["area"].to("cpu")) + # verify boxes + self.assertEqual(encoding_cpu["labels"][0]["boxes"].shape, encoding_gpu["labels"][0]["boxes"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["boxes"][0], encoding_gpu["labels"][0]["boxes"][0].to("cpu"), atol=1e-3 + ) + ) + # verify image_id + torch.testing.assert_close( + encoding_cpu["labels"][0]["image_id"], encoding_gpu["labels"][0]["image_id"].to("cpu") + ) + # verify is_crowd + torch.testing.assert_close( + encoding_cpu["labels"][0]["iscrowd"], encoding_gpu["labels"][0]["iscrowd"].to("cpu") + ) + # verify class_labels + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["class_labels"], encoding_gpu["labels"][0]["class_labels"].to("cpu") + ) + ) + # verify masks + masks_sum_cpu = encoding_cpu["labels"][0]["masks"].sum() + masks_sum_gpu = encoding_gpu["labels"][0]["masks"].sum() + relative_error = torch.abs(masks_sum_cpu - masks_sum_gpu) / masks_sum_cpu + self.assertTrue(relative_error < 1e-3) + # verify orig_size + torch.testing.assert_close( + encoding_cpu["labels"][0]["orig_size"], encoding_gpu["labels"][0]["orig_size"].to("cpu") + ) + # verify size + torch.testing.assert_close(encoding_cpu["labels"][0]["size"], encoding_gpu["labels"][0]["size"].to("cpu")) diff --git a/docs/transformers/tests/models/deformable_detr/test_modeling_deformable_detr.py b/docs/transformers/tests/models/deformable_detr/test_modeling_deformable_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..6274a7e1efb31fbc05875ca62007ae0f13852248 --- /dev/null +++ b/docs/transformers/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -0,0 +1,774 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
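Illustrative aside (not part of the diff): the batched COCO tests above check boxes both in normalized (center_x, center_y, width, height) format and, when do_convert_annotations=False, in absolute (x_min, y_min, x_max, y_max) corners, performing the conversion inline with torch.vstack. A self-contained sketch of that conversion; the helper name is mine:

import torch

def denormalize_cxcywh_to_xyxy(boxes: torch.Tensor, height: int, width: int) -> torch.Tensor:
    """Turn normalized (cx, cy, w, h) boxes into absolute (x_min, y_min, x_max, y_max) corners."""
    cx, cy = boxes[:, 0] * width, boxes[:, 1] * height
    w, h = boxes[:, 2] * width, boxes[:, 3] * height
    return torch.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], dim=1)

# A normalized box (0.5, 0.5, 1.0, 1.0) on the padded 800x1066 canvas spans the whole image:
# denormalize_cxcywh_to_xyxy(torch.tensor([[0.5, 0.5, 1.0, 1.0]]), 800, 1066) -> [[0., 0., 1066., 800.]]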
+"""Testing suite for the PyTorch Deformable DETR model.""" + +import inspect +import math +import unittest + +from transformers import DeformableDetrConfig, ResNetConfig, is_torch_available, is_vision_available +from transformers.file_utils import cached_property +from transformers.testing_utils import ( + require_timm, + require_torch, + require_torch_accelerator, + require_torch_bf16, + require_vision, + slow, + torch_device, +) + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import DeformableDetrForObjectDetection, DeformableDetrModel + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class DeformableDetrModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + image_size=196, + n_targets=8, + num_labels=91, + num_feature_levels=4, + encoder_n_points=2, + decoder_n_points=6, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.image_size = image_size + self.n_targets = n_targets + self.num_labels = num_labels + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = ( + math.ceil(self.image_size / 8) ** 2 + + math.ceil(self.image_size / 16) ** 2 + + math.ceil(self.image_size / 32) ** 2 + + math.ceil(self.image_size / 64) ** 2 + ) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + target["masks"] = torch.rand(self.n_targets, self.image_size, self.image_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, labels + + def get_config(self): + resnet_config = ResNetConfig( + num_channels=3, + embeddings_size=10, + hidden_sizes=[10, 20, 30, 40], + depths=[1, 1, 2, 1], + hidden_act="relu", + num_labels=3, + out_features=["stage2", "stage3", "stage4"], + out_indices=[2, 3, 4], + ) + return DeformableDetrConfig( + d_model=self.hidden_size, + 
encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + num_feature_levels=self.num_feature_levels, + encoder_n_points=self.encoder_n_points, + decoder_n_points=self.decoder_n_points, + use_timm_backbone=False, + backbone=None, + backbone_config=resnet_config, + use_pretrained_backbone=False, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def create_and_check_deformable_detr_model(self, config, pixel_values, pixel_mask, labels): + model = DeformableDetrModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) + + def create_and_check_deformable_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = DeformableDetrForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_torch +class DeformableDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (DeformableDetrModel, DeformableDetrForObjectDetection) if is_torch_available() else () + pipeline_model_mapping = ( + {"image-feature-extraction": DeformableDetrModel, "object-detection": DeformableDetrForObjectDetection} + if is_torch_available() + else {} + ) + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + test_torch_exportable = True + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "DeformableDetrForObjectDetection": + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.image_size, + self.model_tester.image_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + 
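    # Note (illustrative, not part of the diff): in the tester above, with image_size=196 the expected multi-scale
    # encoder sequence length works out to
    #     ceil(196/8)**2 + ceil(196/16)**2 + ceil(196/32)**2 + ceil(196/64)**2
    #   = 25**2 + 13**2 + 7**2 + 4**2 = 625 + 169 + 49 + 16 = 859,
    # i.e. the flattened feature maps of the num_feature_levels=4 levels (strides 8, 16, 32, 64) concatenated.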
def setUp(self): + self.model_tester = DeformableDetrModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=DeformableDetrConfig, + has_text_modality=False, + common_properties=["num_channels", "d_model", "encoder_attention_heads", "decoder_attention_heads"], + ) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_deformable_detr_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deformable_detr_model(*config_and_inputs) + + def test_deformable_detr_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deformable_detr_object_detection_head_model(*config_and_inputs) + + @unittest.skip(reason="Deformable DETR does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Deformable DETR does not use inputs_embeds") + def test_inputs_embeds_matches_input_ids(self): + pass + + @unittest.skip(reason="Deformable DETR does not have a get_input_embeddings method") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="Deformable DETR is not a generative model") + def test_generate_without_input_ids(self): + pass + + @unittest.skip(reason="Deformable DETR does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.encoder_n_points, + ], + ) + out_len = len(outputs) + + correct_outlen = 8 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "DeformableDetrForObjectDetection": + correct_outlen += 2 + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, self.model_tester.num_queries, self.model_tester.num_queries], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + 
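    # Note (illustrative, not part of the diff): deformable attention does not attend over full key sequences;
    # each query samples decoder_n_points locations per feature level, which is why the trailing dims asserted
    # below are (num_attention_heads, num_feature_levels, decoder_n_points) rather than (heads, queries, keys).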
self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.decoder_n_points, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.encoder_n_points, + ], + ) + + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def set_nan_tensor_to_zero(t): + t[t != t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (list, tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + torch.allclose( + set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 + ), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:" + f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has" + f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}." 
+ ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + print("Model class:", model_class) + model = model_class(config) + model.to(torch_device) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence( + model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} + ) + + def test_retain_grad_hidden_states_attentions(self): + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + # we take the second output since last_hidden_state is the second item + output = outputs[1] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_auxiliary_loss(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.auxiliary_loss = True + + # only test for object detection and segmentation model + for model_class in self.all_model_classes[1:]: + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class, 
return_labels=True) + + outputs = model(**inputs) + + self.assertIsNotNone(outputs.auxiliary_outputs) + self.assertEqual(len(outputs.auxiliary_outputs), self.model_tester.num_hidden_layers - 1) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if model.config.is_encoder_decoder: + expected_arg_names = ["pixel_values", "pixel_mask"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", "encoder_outputs"] + if "head_mask" in arg_names and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["pixel_values", "pixel_mask"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_different_timm_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [1, 2, 3, 4]} + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "DeformableDetrForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 4) + else: + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 4) + + self.assertTrue(outputs) + + def test_hf_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Load a pretrained HF checkpoint as backbone + config.backbone = "microsoft/resnet-18" + config.backbone_config = None + config.use_timm_backbone = False + config.use_pretrained_backbone = True + config.backbone_kwargs = {"out_indices": [1, 2, 3, 4]} + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "DeformableDetrForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 4) + else: + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 4) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + print("Model class:", model_class) + model = model_class(config=configs_no_init) + for name, param
in model.named_parameters(): + if param.requires_grad: + if ( + "level_embed" in name + or "sampling_offsets.bias" in name + or "value_proj" in name + or "output_proj" in name + or "reference_points" in name + ): + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage(self): + pass + + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_checkpoints(self): + pass + + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_no_safetensors(self): + pass + + def test_two_stage_training(self): + model_class = DeformableDetrForObjectDetection + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + config.two_stage = True + config.auxiliary_loss = True + config.with_box_refine = True + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def create_and_check_model_fp16_forward(self): + model_class = DeformableDetrForObjectDetection + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + model = model_class(config) + model.to(torch_device) + model.half() + model.eval() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + output = model(**inputs)["last_hidden_state"] + self.assertFalse(torch.isnan(output).any().item()) + + @require_torch_bf16 + def create_and_check_model_bf16_forward(self): + model_class = DeformableDetrForObjectDetection + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + model = model_class(config, torch_dtype=torch.bfloat16) + model.to(torch_device) + model.eval() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + output = model(**inputs)["last_hidden_state"] + self.assertFalse(torch.isnan(output).any().item()) + + +TOLERANCE = 1e-4 + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class DeformableDetrModelIntegrationTests(unittest.TestCase): + @cached_property + def default_image_processor(self): + return AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") if is_vision_available() else None + + def test_inference_object_detection_head(self): + model = DeformableDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + + expected_logits = torch.tensor( + [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] + ).to(torch_device) + expected_boxes = torch.tensor( +
[[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]] + ).to(torch_device) + + torch.testing.assert_close(outputs.logits[0, :3, :3], expected_logits, rtol=1e-4, atol=1e-4) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_boxes, rtol=1e-4, atol=1e-4) + + # verify postprocessing + results = image_processor.post_process_object_detection( + outputs, threshold=0.3, target_sizes=[image.size[::-1]] + )[0] + expected_scores = torch.tensor([0.7999, 0.7894, 0.6331, 0.4720, 0.4382]).to(torch_device) + expected_labels = [17, 17, 75, 75, 63] + expected_slice_boxes = torch.tensor([16.5028, 52.8390, 318.2544, 470.7841]).to(torch_device) + + self.assertEqual(len(results["scores"]), 5) + torch.testing.assert_close(results["scores"], expected_scores, rtol=1e-4, atol=1e-4) + self.assertSequenceEqual(results["labels"].tolist(), expected_labels) + torch.testing.assert_close(results["boxes"][0, :], expected_slice_boxes) + + def test_inference_object_detection_head_with_box_refine_two_stage(self): + model = DeformableDetrForObjectDetection.from_pretrained( + "SenseTime/deformable-detr-with-box-refine-two-stage" + ).to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + + expected_logits = torch.tensor( + [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] + ).to(torch_device) + expected_boxes = torch.tensor( + [[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]] + ).to(torch_device) + + torch.testing.assert_close(outputs.logits[0, :3, :3], expected_logits, rtol=1e-4, atol=1e-4) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_boxes, rtol=1e-4, atol=1e-4) + + @require_torch_accelerator + def test_inference_object_detection_head_equivalence_cpu_gpu(self): + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt") + pixel_values = encoding["pixel_values"] + pixel_mask = encoding["pixel_mask"] + + # 1. run model on CPU + model = DeformableDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr-single-scale") + + with torch.no_grad(): + cpu_outputs = model(pixel_values, pixel_mask) + + # 2. run model on GPU + model.to(torch_device) + + with torch.no_grad(): + gpu_outputs = model(pixel_values.to(torch_device), pixel_mask.to(torch_device)) + + # 3. 
assert equivalence + for key in cpu_outputs.keys(): + assert torch.allclose(cpu_outputs[key], gpu_outputs[key].cpu(), atol=1e-4) + + expected_logits = torch.tensor( + [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] + ) + assert torch.allclose(cpu_outputs.logits[0, :3, :3], expected_logits, atol=1e-4) diff --git a/docs/transformers/tests/models/deit/__init__.py b/docs/transformers/tests/models/deit/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/deit/test_image_processing_deit.py b/docs/transformers/tests/models/deit/test_image_processing_deit.py new file mode 100644 index 0000000000000000000000000000000000000000..9bc204f41aa839523bdca65d95b5ba0c1e40a996 --- /dev/null +++ b/docs/transformers/tests/models/deit/test_image_processing_deit.py @@ -0,0 +1,125 @@ +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from transformers import DeiTImageProcessor + + if is_torchvision_available(): + from transformers import DeiTImageProcessorFast + + +class DeiTImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + size = size if size is not None else {"height": 20, "width": 20} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.crop_size["height"], self.crop_size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, 
+ torchify=torchify, + ) + + +@require_torch +@require_vision +class DeiTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = DeiTImageProcessor if is_vision_available() else None + fast_image_processing_class = DeiTImageProcessorFast if is_torchvision_available() else None + test_cast_dtype = True + + def setUp(self): + super().setUp() + self.image_processor_tester = DeiTImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 20, "width": 20}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) diff --git a/docs/transformers/tests/models/deit/test_modeling_deit.py b/docs/transformers/tests/models/deit/test_modeling_deit.py new file mode 100644 index 0000000000000000000000000000000000000000..50ccdbfb5fdf8c5a8393d96825967ba43401d02a --- /dev/null +++ b/docs/transformers/tests/models/deit/test_modeling_deit.py @@ -0,0 +1,467 @@ +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch DeiT model.""" + +import unittest +import warnings + +from transformers import DeiTConfig +from transformers.testing_utils import ( + require_accelerate, + require_torch, + require_torch_accelerator, + require_torch_fp16, + require_vision, + slow, + torch_device, +) +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + DeiTForImageClassification, + DeiTForImageClassificationWithTeacher, + DeiTForMaskedImageModeling, + DeiTModel, + ) + from transformers.models.auto.modeling_auto import ( + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, + MODEL_MAPPING_NAMES, + ) + + +if is_vision_available(): + from PIL import Image + + from transformers import DeiTImageProcessor + + +class DeiTModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + num_labels=3, + scope=None, + encoder_stride=2, + mask_ratio=0.5, + attn_implementation="eager", + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + self.encoder_stride = encoder_stride + self.attn_implementation = attn_implementation + + # in DeiT, the seq length equals the number of patches + 2 (we add 2 for the [CLS] and distilation tokens) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 2 + self.mask_ratio = mask_ratio + self.num_masks = int(mask_ratio * self.seq_length) + self.mask_length = num_patches + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DeiTConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + encoder_stride=self.encoder_stride, + 
attn_implementation=self.attn_implementation, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DeiTModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels): + model = DeiTForMaskedImageModeling(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual( + result.reconstruction.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size) + ) + + # test greyscale images + config.num_channels = 1 + model = DeiTForMaskedImageModeling(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.reconstruction.shape, (self.batch_size, 1, self.image_size, self.image_size)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = DeiTForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = DeiTForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DeiTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as DeiT does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = ( + ( + DeiTModel, + DeiTForImageClassification, + DeiTForImageClassificationWithTeacher, + DeiTForMaskedImageModeling, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "image-feature-extraction": DeiTModel, + "image-classification": (DeiTForImageClassification, DeiTForImageClassificationWithTeacher), + } + if is_torch_available() + else {} + ) + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = DeiTModelTester(self) + self.config_tester = ConfigTester(self, config_class=DeiTConfig, has_text_modality=False, hidden_size=37) + + @unittest.skip( + "Since `torch==2.3+cu121`, although this test passes, many subsequent tests have `CUDA error: misaligned address`." + "If `nvidia-xxx-cu118` are also installed, no failure (even with `torch==2.3+cu121`)." 
+ ) + def test_multi_gpu_data_parallel_forward(self): + super().test_multi_gpu_data_parallel_forward() + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="DeiT does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_image_modeling(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + # special case for DeiTForImageClassificationWithTeacher model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "DeiTForImageClassificationWithTeacher": + del inputs_dict["labels"] + + return inputs_dict + + def test_training(self): + if not self.model_tester.is_training: + self.skipTest(reason="model_tester.is_training is set to False") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + # DeiTForImageClassificationWithTeacher supports inference-only + if ( + model_class.__name__ in MODEL_MAPPING_NAMES.values() + or model_class.__name__ == "DeiTForImageClassificationWithTeacher" + ): + continue + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + if not self.model_tester.is_training: + self.skipTest(reason="model_tester.is_training is set to False") + + config.use_cache = False + config.return_dict = True + + for model_class in self.all_model_classes: + if model_class.__name__ in MODEL_MAPPING_NAMES.values() or not model_class.supports_gradient_checkpointing: + continue + # DeiTForImageClassificationWithTeacher supports inference-only + if model_class.__name__ == "DeiTForImageClassificationWithTeacher": + continue + model = model_class(config) + model.gradient_checkpointing_enable() + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def 
test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def test_problem_types(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + problem_types = [ + {"title": "multi_label_classification", "num_labels": 2, "dtype": torch.float}, + {"title": "single_label_classification", "num_labels": 1, "dtype": torch.long}, + {"title": "regression", "num_labels": 1, "dtype": torch.float}, + ] + + for model_class in self.all_model_classes: + if ( + model_class.__name__ + not in [ + *MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES.values(), + *MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES.values(), + ] + or model_class.__name__ == "DeiTForImageClassificationWithTeacher" + ): + continue + + for problem_type in problem_types: + with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): + config.problem_type = problem_type["title"] + config.num_labels = problem_type["num_labels"] + + model = model_class(config) + model.to(torch_device) + model.train() + + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + + if problem_type["num_labels"] > 1: + inputs["labels"] = inputs["labels"].unsqueeze(1).repeat(1, problem_type["num_labels"]) + + inputs["labels"] = inputs["labels"].to(problem_type["dtype"]) + + # This tests that we do not trigger the warning form PyTorch "Using a target size that is different + # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure + # they have the same size." which is a symptom something in wrong for the regression problem. + # See https://github.com/huggingface/transformers/issues/11780 + with warnings.catch_warnings(record=True) as warning_list: + loss = model(**inputs).loss + for w in warning_list: + if "Using a target size that is different to the input size" in str(w.message): + raise ValueError( + f"Something is going wrong in the regression problem: intercepted {w.message}" + ) + + loss.backward() + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/deit-base-distilled-patch16-224" + model = DeiTModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class DeiTModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ( + DeiTImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224") + if is_vision_available() + else None + ) + + @slow + def test_inference_image_classification_head(self): + model = DeiTForImageClassificationWithTeacher.from_pretrained("facebook/deit-base-distilled-patch16-224").to( + torch_device + ) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-1.0266, 0.1912, -1.2861]).to(torch_device) + + torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4) + + @slow + def test_inference_interpolate_pos_encoding(self): + model = DeiTForImageClassificationWithTeacher.from_pretrained("facebook/deit-base-distilled-patch16-224").to( + torch_device + ) + + image_processor 
= self.default_image_processor + + # image size is {"height": 480, "width": 640} + image = prepare_img() + image_processor.size = {"height": 480, "width": 640} + # center crop set to False so image is not center cropped to 224x224 + inputs = image_processor(images=image, return_tensors="pt", do_center_crop=False).to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + @slow + @require_accelerate + @require_torch_accelerator + @require_torch_fp16 + def test_inference_fp16(self): + r""" + A small test to make sure that inference work in half precision without any problem. + """ + model = DeiTModel.from_pretrained( + "facebook/deit-base-distilled-patch16-224", torch_dtype=torch.float16, device_map="auto" + ) + image_processor = self.default_image_processor + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + # forward pass to make sure inference works in fp16 + with torch.no_grad(): + _ = model(pixel_values) diff --git a/docs/transformers/tests/models/deit/test_modeling_tf_deit.py b/docs/transformers/tests/models/deit/test_modeling_tf_deit.py new file mode 100644 index 0000000000000000000000000000000000000000..1ca091f52695066f3192ac2851b50fb072bd8f98 --- /dev/null +++ b/docs/transformers/tests/models/deit/test_modeling_tf_deit.py @@ -0,0 +1,311 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the TensorFlow DeiT model.""" + +from __future__ import annotations + +import inspect +import unittest + +import numpy as np + +from transformers import DeiTConfig +from transformers.testing_utils import require_tf, require_vision, slow +from transformers.utils import cached_property, is_tf_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TFDeiTForImageClassification, + TFDeiTForImageClassificationWithTeacher, + TFDeiTForMaskedImageModeling, + TFDeiTModel, + ) + from transformers.modeling_tf_utils import keras + + +if is_vision_available(): + from PIL import Image + + from transformers import DeiTImageProcessor + + +class TFDeiTModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + num_labels=3, + scope=None, + encoder_stride=2, + attn_implementation="eager", + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + self.encoder_stride = encoder_stride + self.attn_implementation = attn_implementation + + # in DeiT, the seq length equals the number of patches + 2 (we add 2 for the [CLS] and distilation tokens) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 2 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DeiTConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + encoder_stride=self.encoder_stride, + attn_implementation=self.attn_implementation, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = TFDeiTModel(config=config) + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def 
create_and_check_for_masked_image_modeling(self, config, pixel_values, labels): + model = TFDeiTForMaskedImageModeling(config=config) + result = model(pixel_values) + self.parent.assertEqual( + result.reconstruction.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size) + ) + + # test greyscale images + config.num_channels = 1 + model = TFDeiTForMaskedImageModeling(config) + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.reconstruction.shape, (self.batch_size, 1, self.image_size, self.image_size)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = TFDeiTForImageClassification(config) + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = TFDeiTForImageClassification(config) + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_tf +class TFDeiTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_tf_common.py, as DeiT does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = ( + ( + TFDeiTModel, + TFDeiTForImageClassification, + TFDeiTForImageClassificationWithTeacher, + TFDeiTForMaskedImageModeling, + ) + if is_tf_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": TFDeiTModel, + "image-classification": (TFDeiTForImageClassification, TFDeiTForImageClassificationWithTeacher), + } + if is_tf_available() + else {} + ) + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFDeiTModelTester(self) + self.config_tester = ConfigTester(self, config_class=DeiTConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="DeiT does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, keras.layers.Dense)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_image_modeling(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + # special case for DeiTForImageClassificationWithTeacher model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if "labels" in inputs_dict and "labels" not in inspect.signature(model_class.call).parameters: + del inputs_dict["labels"] + + return inputs_dict + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/deit-base-distilled-patch16-224" + model = TFDeiTModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_tf +@require_vision +class DeiTModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ( + DeiTImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224") + if is_vision_available() + else None + ) + + @slow + def test_inference_image_classification_head(self): + model = TFDeiTForImageClassificationWithTeacher.from_pretrained("facebook/deit-base-distilled-patch16-224") + + image_processor = self.default_image_processor + image = prepare_img() + inputs = 
image_processor(images=image, return_tensors="tf") + + # forward pass + outputs = model(**inputs) + + # verify the logits + expected_shape = tf.TensorShape((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = tf.constant([-1.0266, 0.1912, -1.2861]) + + self.assertTrue(np.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_interpolate_pos_encoding(self): + model = TFDeiTForImageClassificationWithTeacher.from_pretrained("facebook/deit-base-distilled-patch16-224") + + image_processor = self.default_image_processor + # image size is {"height": 480, "width": 640} + image = prepare_img() + image_processor.size = {"height": 480, "width": 640} + # center crop set to False so image is not center cropped to 224x224 + inputs = image_processor(images=image, return_tensors="tf", do_center_crop=False) + # forward pass + outputs = model(**inputs, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = tf.TensorShape((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) diff --git a/docs/transformers/tests/models/depth_anything/__init__.py b/docs/transformers/tests/models/depth_anything/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/depth_anything/test_modeling_depth_anything.py b/docs/transformers/tests/models/depth_anything/test_modeling_depth_anything.py new file mode 100644 index 0000000000000000000000000000000000000000..bf9bc907dc83d01a73238bd0db1a1e580f00a872 --- /dev/null +++ b/docs/transformers/tests/models/depth_anything/test_modeling_depth_anything.py @@ -0,0 +1,312 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Depth Anything model.""" + +import unittest + +from transformers import DepthAnythingConfig, Dinov2Config +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_4 +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import DepthAnythingForDepthEstimation + + +if is_vision_available(): + from PIL import Image + + from transformers import DPTImageProcessor + + +class DepthAnythingModelTester: + # Copied from tests.models.dpt.test_modeling_dpt_auto_backbone.DPTModelTester.__init__ + def __init__( + self, + parent, + batch_size=2, + num_channels=3, + image_size=32, + patch_size=16, + use_labels=True, + num_labels=3, + is_training=True, + hidden_size=4, + num_hidden_layers=2, + num_attention_heads=2, + intermediate_size=8, + out_features=["stage1", "stage2"], + apply_layernorm=False, + reshape_hidden_states=False, + neck_hidden_sizes=[2, 2], + fusion_hidden_size=6, + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.patch_size = patch_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.out_features = out_features + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + self.use_labels = use_labels + self.num_labels = num_labels + self.is_training = is_training + self.neck_hidden_sizes = neck_hidden_sizes + self.fusion_hidden_size = fusion_hidden_size + # DPT's sequence length + self.seq_length = (self.image_size // self.patch_size) ** 2 + 1 + + # Copied from tests.models.dpt.test_modeling_dpt_auto_backbone.DPTModelTester.prepare_config_and_inputs + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DepthAnythingConfig( + backbone_config=self.get_backbone_config(), + reassemble_hidden_size=self.hidden_size, + patch_size=self.patch_size, + neck_hidden_sizes=self.neck_hidden_sizes, + fusion_hidden_size=self.fusion_hidden_size, + ) + + # Copied from tests.models.dpt.test_modeling_dpt_auto_backbone.DPTModelTester.get_backbone_config + def get_backbone_config(self): + return Dinov2Config( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + is_training=self.is_training, + out_features=self.out_features, + reshape_hidden_states=self.reshape_hidden_states, + ) + + # Copied from tests.models.dpt.test_modeling_dpt_auto_backbone.DPTModelTester.create_and_check_for_depth_estimation with DPT->DepthAnything + def create_and_check_for_depth_estimation(self, config, pixel_values, labels): + config.num_labels = 
self.num_labels + model = DepthAnythingForDepthEstimation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + + # Copied from tests.models.dpt.test_modeling_dpt_auto_backbone.DPTModelTester.prepare_config_and_inputs_for_common + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DepthAnythingModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as Depth Anything does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (DepthAnythingForDepthEstimation,) if is_torch_available() else () + pipeline_model_mapping = {"depth-estimation": DepthAnythingForDepthEstimation} if is_torch_available() else {} + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = DepthAnythingModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=DepthAnythingConfig, + has_text_modality=False, + hidden_size=37, + common_properties=["patch_size"], + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="Depth Anything with AutoBackbone does not have a base model and hence no input_embeddings") + def test_inputs_embeds(self): + pass + + def test_for_depth_estimation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + + @unittest.skip(reason="Depth Anything does not support training yet") + def test_training(self): + pass + + @unittest.skip(reason="Depth Anything does not support training yet") + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="Depth Anything with AutoBackbone does not have a base model and hence no input_embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip( + reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "LiheYoung/depth-anything-small-hf" + model = DepthAnythingForDepthEstimation.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_backbone_selection(self): + def _validate_backbone_init(): + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + # Confirm out_indices propagated to backbone + self.assertEqual(len(model.backbone.out_indices), 2) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Load a timm backbone + config.backbone = "resnet18" + config.use_pretrained_backbone = True + config.use_timm_backbone = True + config.backbone_config = None + # For transformer backbones we can't set the out_indices or just 
return the features + config.backbone_kwargs = {"out_indices": (-2, -1)} + _validate_backbone_init() + + # Load a HF backbone + config.backbone = "facebook/dinov2-small" + config.use_pretrained_backbone = True + config.use_timm_backbone = False + config.backbone_config = None + config.backbone_kwargs = {"out_indices": [-2, -1]} + _validate_backbone_init() + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +@slow +class DepthAnythingModelIntegrationTest(unittest.TestCase): + def test_inference(self): + # -- `relative` depth model -- + image_processor = DPTImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf") + model = DepthAnythingForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size([1, 518, 686]) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[8.8223, 8.6483, 8.6216], [8.3332, 8.6047, 8.7545], [8.6547, 8.6885, 8.7472]], + ).to(torch_device) + + torch.testing.assert_close(predicted_depth[0, :3, :3], expected_slice, rtol=1e-6, atol=1e-6) + + # -- `metric` depth model -- + image_processor = DPTImageProcessor.from_pretrained("depth-anything/depth-anything-V2-metric-indoor-small-hf") + model = DepthAnythingForDepthEstimation.from_pretrained( + "depth-anything/depth-anything-V2-metric-indoor-small-hf" + ).to(torch_device) + + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size([1, 518, 686]) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[1.3349, 1.2947, 1.2802], [1.2794, 1.2338, 1.2901], [1.2630, 1.2219, 1.2478]], + ).to(torch_device) + + torch.testing.assert_close(predicted_depth[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + def test_export(self): + for strict in [True, False]: + with self.subTest(strict=strict): + if not is_torch_greater_or_equal_than_2_4: + self.skipTest(reason="This test requires torch >= 2.4 to run.") + model = ( + DepthAnythingForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf") + .to(torch_device) + .eval() + ) + image_processor = DPTImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf") + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + exported_program = torch.export.export( + model, + args=(inputs["pixel_values"],), + strict=strict, + ) + with torch.no_grad(): + eager_outputs = model(**inputs) + exported_outputs = exported_program.module().forward(inputs["pixel_values"]) + self.assertEqual(eager_outputs.predicted_depth.shape, exported_outputs.predicted_depth.shape) + self.assertTrue( + torch.allclose(eager_outputs.predicted_depth, exported_outputs.predicted_depth, atol=1e-4) + ) diff --git a/docs/transformers/tests/models/depth_pro/__init__.py b/docs/transformers/tests/models/depth_pro/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/depth_pro/test_image_processing_depth_pro.py b/docs/transformers/tests/models/depth_pro/test_image_processing_depth_pro.py new file mode 100644 index 0000000000000000000000000000000000000000..a14b60617150a82c904ba26b5feb80ec5ebd3d17 --- /dev/null +++ b/docs/transformers/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -0,0 +1,123 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers.testing_utils import is_flaky, require_torch, require_vision +from transformers.utils import is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from transformers import DepthProImageProcessor + + if is_torchvision_available(): + from transformers import DepthProImageProcessorFast + + +class DepthProImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_rescale=True, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + super().__init__() + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_rescale": self.do_rescale, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.size["height"], self.size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class DepthProImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = DepthProImageProcessor if is_vision_available() else None + fast_image_processing_class = DepthProImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = DepthProImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + 
def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "resample")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + @is_flaky( + description="fast and slow, both processors use torch implementation, see: https://github.com/huggingface/transformers/issues/34920", + ) + def test_fast_is_faster_than_slow(self): + super().test_fast_is_faster_than_slow() diff --git a/docs/transformers/tests/models/depth_pro/test_modeling_depth_pro.py b/docs/transformers/tests/models/depth_pro/test_modeling_depth_pro.py new file mode 100644 index 0000000000000000000000000000000000000000..bd35887e585bc3bfda117642839b6293a2b7c106 --- /dev/null +++ b/docs/transformers/tests/models/depth_pro/test_modeling_depth_pro.py @@ -0,0 +1,398 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch DepthPro model.""" + +import unittest + +from transformers import DepthProConfig +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import DepthProForDepthEstimation, DepthProModel + from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES + + +if is_vision_available(): + from PIL import Image + + from transformers import DepthProImageProcessor + + +class DepthProModelTester: + def __init__( + self, + parent, + batch_size=8, + image_size=64, + patch_size=16, + num_channels=3, + is_training=True, + use_labels=True, + fusion_hidden_size=16, + intermediate_hook_ids=[1, 0], + intermediate_feature_dims=[10, 8], + scaled_images_ratios=[0.5, 1.0], + scaled_images_overlap_ratios=[0.0, 0.2], + scaled_images_feature_dims=[12, 12], + initializer_range=0.02, + use_fov_model=False, + image_model_config={ + "model_type": "dinov2", + "num_hidden_layers": 2, + "hidden_size": 16, + "num_attention_heads": 1, + "patch_size": 4, + }, + patch_model_config={ + "model_type": "vit", + "num_hidden_layers": 2, + "hidden_size": 24, + "num_attention_heads": 2, + "patch_size": 6, + }, + fov_model_config={ + "model_type": "vit", + "num_hidden_layers": 2, + "hidden_size": 32, + "num_attention_heads": 4, + "patch_size": 8, + }, + num_labels=3, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.fusion_hidden_size = fusion_hidden_size + self.intermediate_hook_ids = intermediate_hook_ids + self.intermediate_feature_dims = intermediate_feature_dims + self.scaled_images_ratios = scaled_images_ratios + self.scaled_images_overlap_ratios = scaled_images_overlap_ratios + self.scaled_images_feature_dims = scaled_images_feature_dims + self.initializer_range = initializer_range + self.use_fov_model = use_fov_model + self.image_model_config = image_model_config + self.patch_model_config = patch_model_config + self.fov_model_config = fov_model_config + self.num_labels = num_labels + + self.hidden_size = image_model_config["hidden_size"] + self.num_hidden_layers = image_model_config["num_hidden_layers"] + self.num_attention_heads = image_model_config["num_attention_heads"] + + # may be different for a backbone other than dinov2 + self.out_size = patch_size // image_model_config["patch_size"] + self.seq_length = self.out_size**2 + 1 # we add 1 for the [CLS] token + + n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) + self.expected_depth_size = 2 ** (n_fusion_blocks + 1) * self.out_size + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DepthProConfig( + patch_size=self.patch_size, + fusion_hidden_size=self.fusion_hidden_size, + 
intermediate_hook_ids=self.intermediate_hook_ids, + intermediate_feature_dims=self.intermediate_feature_dims, + scaled_images_ratios=self.scaled_images_ratios, + scaled_images_overlap_ratios=self.scaled_images_overlap_ratios, + scaled_images_feature_dims=self.scaled_images_feature_dims, + initializer_range=self.initializer_range, + image_model_config=self.image_model_config, + patch_model_config=self.patch_model_config, + fov_model_config=self.fov_model_config, + use_fov_model=self.use_fov_model, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DepthProModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_depth_estimation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = DepthProForDepthEstimation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual( + result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size) + ) + + def create_and_check_for_fov(self, config, pixel_values, labels): + model = DepthProForDepthEstimation(config, use_fov_model=True) + model.to(torch_device) + model.eval() + + # check if the fov_model (DinoV2-based encoder) is created + self.parent.assertIsNotNone(model.fov_model) + + batched_pixel_values = pixel_values + row_pixel_values = pixel_values[:1] + + with torch.no_grad(): + model_batched_output_fov = model(batched_pixel_values).field_of_view + model_row_output_fov = model(row_pixel_values).field_of_view + + # check if fov is returned + self.parent.assertIsNotNone(model_batched_output_fov) + self.parent.assertIsNotNone(model_row_output_fov) + + # check output shape consistency for fov + self.parent.assertEqual(model_batched_output_fov.shape, (self.batch_size,)) + + # check equivalence between batched and single row outputs for fov + diff = torch.max(torch.abs(model_row_output_fov - model_batched_output_fov[:1])) + model_name = model.__class__.__name__ + self.parent.assertTrue( + diff <= 1e-03, + msg=(f"Batched and Single row outputs are not equal in {model_name} for fov. Difference={diff}."), + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DepthProModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as DepthPro does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = (DepthProModel, DepthProForDepthEstimation) if is_torch_available() else () + pipeline_model_mapping = ( + { + "depth-estimation": DepthProForDepthEstimation, + "image-feature-extraction": DepthProModel, + } + if is_torch_available() + else {} + ) + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = DepthProModelTester(self) + self.config_tester = ConfigTester(self, config_class=DepthProConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="DepthPro does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_depth_estimation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + + def test_for_fov(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_fov(*config_and_inputs) + + def test_training(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DepthProForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values(): + continue + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DepthProForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values() or not model_class.supports_gradient_checkpointing: + continue + model = model_class(config) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, 
param in model.named_parameters(): + non_uniform_init_params = [ + # these encoders are vision transformers + # any layer outside these encoders is either Conv2d or ConvTranspose2d + # which use kaiming initialization + "patch_encoder", + "image_encoder", + "fov_model.encoder", + ] + if param.requires_grad: + if any(x in name for x in non_uniform_init_params): + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # this started when we switched from normal initialization to kaiming_normal initialization + # maybe because the magnitude of offset values from ViT-encoders increases when followed by many convolution layers + def test_batching_equivalence(self, atol=1e-4, rtol=1e-4): + super().test_batching_equivalence(atol=atol, rtol=rtol) + + @slow + def test_model_from_pretrained(self): + model_path = "apple/DepthPro-hf" + model = DepthProModel.from_pretrained(model_path) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +@slow +class DepthProModelIntegrationTest(unittest.TestCase): + def test_inference_depth_estimation(self): + model_path = "apple/DepthPro-hf" + image_processor = DepthProImageProcessor.from_pretrained(model_path) + model = DepthProForDepthEstimation.from_pretrained(model_path).to(torch_device) + config = model.config + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the predicted depth + n_fusion_blocks = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + out_size = config.image_model_config.image_size // config.image_model_config.patch_size + expected_depth_size = 2 ** (n_fusion_blocks + 1) * out_size + + expected_shape = torch.Size((1, expected_depth_size, expected_depth_size)) + self.assertEqual(outputs.predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[1.0582, 1.1225, 1.1335], [1.1154, 1.1398, 1.1486], [1.1434, 1.1500, 1.1643]] + ).to(torch_device) + torch.testing.assert_close(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4, rtol=1e-4) + + # verify the predicted fov + expected_shape = torch.Size((1,)) + self.assertEqual(outputs.field_of_view.shape, expected_shape) + + expected_slice = torch.tensor([47.2459]).to(torch_device) + torch.testing.assert_close(outputs.field_of_view, expected_slice, atol=1e-4, rtol=1e-4) + + def test_post_processing_depth_estimation(self): + model_path = "apple/DepthPro-hf" + image_processor = DepthProImageProcessor.from_pretrained(model_path) + model = DepthProForDepthEstimation.from_pretrained(model_path) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + outputs = image_processor.post_process_depth_estimation( + outputs, + target_sizes=[[image.height, image.width]], + ) + predicted_depth = outputs[0]["predicted_depth"] + expected_shape = torch.Size((image.height, image.width)) + self.assertTrue(predicted_depth.shape == expected_shape) diff --git 
a/docs/transformers/tests/models/detr/__init__.py b/docs/transformers/tests/models/detr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/detr/test_image_processing_detr.py b/docs/transformers/tests/models/detr/test_image_processing_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..deba250ae15866b594142b93e81273171e5c4611 --- /dev/null +++ b/docs/transformers/tests/models/detr/test_image_processing_detr.py @@ -0,0 +1,789 @@ +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import pathlib +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_torch_gpu, require_torchvision, require_vision, slow +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available + +from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import DetrImageProcessor + + if is_torchvision_available(): + from transformers import DetrImageProcessorFast + + +class DetrImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_rescale=True, + rescale_factor=1 / 255, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + do_pad=True, + ): + # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p + size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_pad = do_pad + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_rescale": self.do_rescale, + "rescale_factor": self.rescale_factor, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_pad": self.do_pad, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to DetrImageProcessor, + assuming do_resize is set to True with a scalar size. 
+ """ + if not batched: + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + elif isinstance(image, np.ndarray): + h, w = image.shape[0], image.shape[1] + else: + h, w = image.shape[1], image.shape[2] + if w < h: + expected_height = int(self.size["shortest_edge"] * h / w) + expected_width = self.size["shortest_edge"] + elif w > h: + expected_height = self.size["shortest_edge"] + expected_width = int(self.size["shortest_edge"] * w / h) + else: + expected_height = self.size["shortest_edge"] + expected_width = self.size["shortest_edge"] + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + def expected_output_image_shape(self, images): + height, width = self.get_expected_values(images, batched=True) + return self.num_channels, height, width + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = DetrImageProcessor if is_vision_available() else None + fast_image_processing_class = DetrImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = DetrImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_pad")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) + self.assertEqual(image_processor.do_pad, True) + + image_processor = image_processing_class.from_dict( + self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False + ) + self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) + self.assertEqual(image_processor.do_pad, False) + + def test_should_raise_if_annotation_format_invalid(self): + image_processor_dict = self.image_processor_tester.prepare_image_processor_dict() + + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f: + detection_target = json.loads(f.read()) + + annotations = {"image_id": 39769, 
"annotations": detection_target} + + params = { + "images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + "annotations": annotations, + "return_tensors": "pt", + } + + image_processor_params = {**image_processor_dict, **{"format": "_INVALID_FORMAT_"}} + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class(**image_processor_params) + + with self.assertRaises(ValueError) as e: + image_processor(**params) + + self.assertTrue(str(e.exception).startswith("_INVALID_FORMAT_ is not a valid AnnotationFormat")) + + def test_valid_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f: + target = json.loads(f.read()) + + params = {"image_id": 39769, "annotations": target} + + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class.from_pretrained("facebook/detr-resnet-50") + + # legal encodings (single image) + _ = image_processing(images=image, annotations=params, return_tensors="pt") + _ = image_processing(images=image, annotations=[params], return_tensors="pt") + + # legal encodings (batch of one image) + _ = image_processing(images=[image], annotations=params, return_tensors="pt") + _ = image_processing(images=[image], annotations=[params], return_tensors="pt") + + # legal encoding (batch of more than one image) + n = 5 + _ = image_processing(images=[image] * n, annotations=[params] * n, return_tensors="pt") + + # example of an illegal encoding (missing the 'image_id' key) + with self.assertRaises(ValueError) as e: + image_processing(images=image, annotations={"annotations": target}, return_tensors="pt") + + self.assertTrue(str(e.exception).startswith("Invalid COCO detection annotations")) + + # example of an illegal encoding (unequal lengths of images and annotations) + with self.assertRaises(ValueError) as e: + image_processing(images=[image] * n, annotations=[params] * (n - 1), return_tensors="pt") + + self.assertTrue(str(e.exception) == "The number of images (5) and annotations (4) do not match.") + + @slow + def test_call_pytorch_with_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class.from_pretrained("facebook/detr-resnet-50") + encoding = image_processing(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + torch.testing.assert_close(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + 
torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + torch.testing.assert_close(encoding["labels"][0]["size"], expected_size) + + @slow + def test_call_pytorch_with_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class.from_pretrained("facebook/detr-resnet-50-panoptic") + encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + torch.testing.assert_close(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify masks + expected_masks_sum = 822873 + relative_error = torch.abs(encoding["labels"][0]["masks"].sum() - expected_masks_sum) / expected_masks_sum + self.assertTrue(relative_error < 1e-3) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + torch.testing.assert_close(encoding["labels"][0]["size"], expected_size) + + @slow + def test_batched_coco_detection_annotations(self): + image_0 = 
Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800)) + + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f: + target = json.loads(f.read()) + + annotations_0 = {"image_id": 39769, "annotations": target} + annotations_1 = {"image_id": 39769, "annotations": target} + + # Adjust the bounding boxes for the resized image + w_0, h_0 = image_0.size + w_1, h_1 = image_1.size + for i in range(len(annotations_1["annotations"])): + coords = annotations_1["annotations"][i]["bbox"] + new_bbox = [ + coords[0] * w_1 / w_0, + coords[1] * h_1 / h_0, + coords[2] * w_1 / w_0, + coords[3] * h_1 / h_0, + ] + annotations_1["annotations"][i]["bbox"] = new_bbox + + images = [image_0, image_1] + annotations = [annotations_0, annotations_1] + + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class() + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + return_tensors="pt", # do_convert_annotations=True + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.6879, 0.4609, 0.0755, 0.3691], + [0.2118, 0.3359, 0.2601, 0.1566], + [0.5011, 0.5000, 0.9979, 1.0000], + [0.5010, 0.5020, 0.9979, 0.9959], + [0.3284, 0.5944, 0.5884, 0.8112], + [0.8394, 0.5445, 0.3213, 0.9110], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.4130, 0.2765, 0.0453, 0.2215], + [0.1272, 0.2016, 0.1561, 0.0940], + [0.3757, 0.4933, 0.7488, 0.9865], + [0.3759, 0.5002, 0.7492, 0.9955], + [0.1971, 0.5456, 0.3532, 0.8646], + [0.5790, 0.4115, 0.3430, 0.7161], + ] + ) + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1e-3, rtol=1e-3) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] 
+ ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1, rtol=1) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1, rtol=1) + + def test_batched_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800)) + + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt") as f: + target = json.loads(f.read()) + + annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + w_0, h_0 = image_0.size + w_1, h_1 = image_1.size + for i in range(len(annotation_1["segments_info"])): + coords = annotation_1["segments_info"][i]["bbox"] + new_bbox = [ + coords[0] * w_1 / w_0, + coords[1] * h_1 / h_0, + coords[2] * w_1 / w_0, + coords[3] * h_1 / h_0, + ] + annotation_1["segments_info"][i]["bbox"] = new_bbox + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + images = [image_0, image_1] + annotations = [annotation_0, annotation_1] + + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class(format="coco_panoptic") + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_tensors="pt", + return_segmentation_masks=True, + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.2625, 0.5437, 0.4688, 0.8625], + [0.7719, 0.4104, 0.4531, 0.7125], + [0.5000, 0.4927, 0.9969, 0.9854], + [0.1688, 0.2000, 0.2063, 0.0917], + [0.5492, 0.2760, 0.0578, 0.2187], + [0.4992, 0.4990, 0.9984, 0.9979], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.1576, 0.3262, 0.2814, 0.5175], + [0.4634, 0.2463, 0.2720, 0.4275], + [0.3002, 0.2956, 0.5985, 0.5913], + [0.1013, 0.1200, 0.1238, 0.0550], + [0.3297, 0.1656, 0.0347, 0.1312], + [0.2997, 0.2994, 0.5994, 0.5987], + ] + ) + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1e-3, rtol=1e-3) + + # Check the masks have also been padded + 
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1, rtol=1) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1, rtol=1) + + def test_max_width_max_height_resizing_and_pad_strategy(self): + for image_processing_class in self.image_processor_list: + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, 
torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = image_processing_class( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) + + def test_longest_edge_shortest_edge_resizing_strategy(self): + for image_processing_class in self.image_processor_list: + image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) + + # max size is set; width < height; + # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436 + image_processor = image_processing_class( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436])) + + image_2 = torch.ones([653, 958, 3], dtype=torch.uint8) + # max size is set; height < width; + # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640 + image_processor = image_processing_class( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640])) + + image_3 = torch.ones([100, 120, 3], dtype=torch.uint8) + # max size is set; width == size; height > max_size; + # do_pad=False, longest_edge=118, shortest_edge=100, image=120x100 -> 118x98 + image_processor = image_processing_class( + size={"longest_edge": 118, "shortest_edge": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_3], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118])) + + image_4 = torch.ones([128, 50, 3], dtype=torch.uint8) + # max size is set; height == size; width < max_size; + # do_pad=False, longest_edge=256, shortest_edge=50, image=50x128 -> 50x128 + image_processor = image_processing_class( + size={"longest_edge": 256, "shortest_edge": 50}, + do_pad=False, + ) + inputs = image_processor(images=[image_4], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50])) + + image_5 = torch.ones([50, 50, 3], dtype=torch.uint8) + # max size is set; height == width; width < max_size; + # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50 + image_processor = image_processing_class( + size={"longest_edge": 117, "shortest_edge": 50}, + do_pad=False, + ) + inputs = image_processor(images=[image_5], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50])) + + @slow + @require_torch_gpu + @require_torchvision + def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + processor = self.image_processor_list[1]() + # 1. run processor on CPU + encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu") + # 2. 
run processor on GPU + encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device="cuda") + + # verify pixel values + self.assertEqual(encoding_cpu["pixel_values"].shape, encoding_gpu["pixel_values"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["pixel_values"][0, 0, 0, :3], + encoding_gpu["pixel_values"][0, 0, 0, :3].to("cpu"), + atol=1e-4, + ) + ) + # verify area + torch.testing.assert_close(encoding_cpu["labels"][0]["area"], encoding_gpu["labels"][0]["area"].to("cpu")) + # verify boxes + self.assertEqual(encoding_cpu["labels"][0]["boxes"].shape, encoding_gpu["labels"][0]["boxes"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["boxes"][0], encoding_gpu["labels"][0]["boxes"][0].to("cpu"), atol=1e-3 + ) + ) + # verify image_id + torch.testing.assert_close( + encoding_cpu["labels"][0]["image_id"], encoding_gpu["labels"][0]["image_id"].to("cpu") + ) + # verify is_crowd + torch.testing.assert_close( + encoding_cpu["labels"][0]["iscrowd"], encoding_gpu["labels"][0]["iscrowd"].to("cpu") + ) + # verify class_labels + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["class_labels"], encoding_gpu["labels"][0]["class_labels"].to("cpu") + ) + ) + # verify orig_size + torch.testing.assert_close( + encoding_cpu["labels"][0]["orig_size"], encoding_gpu["labels"][0]["orig_size"].to("cpu") + ) + # verify size + torch.testing.assert_close(encoding_cpu["labels"][0]["size"], encoding_gpu["labels"][0]["size"].to("cpu")) + + @slow + @require_torch_gpu + @require_torchvision + def test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + processor = self.image_processor_list[1](format="coco_panoptic") + # 1. run processor on CPU + encoding_cpu = processor( + images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cpu" + ) + # 2. 
run processor on GPU + encoding_gpu = processor( + images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cuda" + ) + + # verify pixel values + self.assertEqual(encoding_cpu["pixel_values"].shape, encoding_gpu["pixel_values"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["pixel_values"][0, 0, 0, :3], + encoding_gpu["pixel_values"][0, 0, 0, :3].to("cpu"), + atol=1e-4, + ) + ) + # verify area + torch.testing.assert_close(encoding_cpu["labels"][0]["area"], encoding_gpu["labels"][0]["area"].to("cpu")) + # verify boxes + self.assertEqual(encoding_cpu["labels"][0]["boxes"].shape, encoding_gpu["labels"][0]["boxes"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["boxes"][0], encoding_gpu["labels"][0]["boxes"][0].to("cpu"), atol=1e-3 + ) + ) + # verify image_id + torch.testing.assert_close( + encoding_cpu["labels"][0]["image_id"], encoding_gpu["labels"][0]["image_id"].to("cpu") + ) + # verify is_crowd + torch.testing.assert_close( + encoding_cpu["labels"][0]["iscrowd"], encoding_gpu["labels"][0]["iscrowd"].to("cpu") + ) + # verify class_labels + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["class_labels"], encoding_gpu["labels"][0]["class_labels"].to("cpu") + ) + ) + # verify masks + masks_sum_cpu = encoding_cpu["labels"][0]["masks"].sum() + masks_sum_gpu = encoding_gpu["labels"][0]["masks"].sum() + relative_error = torch.abs(masks_sum_cpu - masks_sum_gpu) / masks_sum_cpu + self.assertTrue(relative_error < 1e-3) + # verify orig_size + torch.testing.assert_close( + encoding_cpu["labels"][0]["orig_size"], encoding_gpu["labels"][0]["orig_size"].to("cpu") + ) + # verify size + torch.testing.assert_close(encoding_cpu["labels"][0]["size"], encoding_gpu["labels"][0]["size"].to("cpu")) diff --git a/docs/transformers/tests/models/detr/test_modeling_detr.py b/docs/transformers/tests/models/detr/test_modeling_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..6d31cdc65dbf6e984c2d5ab80f035223b09f0f8c --- /dev/null +++ b/docs/transformers/tests/models/detr/test_modeling_detr.py @@ -0,0 +1,720 @@ +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch DETR model.""" + +import inspect +import math +import unittest + +from transformers import DetrConfig, ResNetConfig, is_torch_available, is_vision_available +from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import DetrForObjectDetection, DetrForSegmentation, DetrModel + + +if is_vision_available(): + from PIL import Image + + from transformers import DetrImageProcessor + + +class DetrModelTester: + def __init__( + self, + parent, + batch_size=8, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=12, + num_channels=3, + min_size=200, + max_size=200, + n_targets=8, + num_labels=91, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.min_size = min_size + self.max_size = max_size + self.n_targets = n_targets + self.num_labels = num_labels + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length = math.ceil(self.min_size / 32) * math.ceil(self.max_size / 32) + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) + + pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, labels + + def get_config(self): + resnet_config = ResNetConfig( + num_channels=3, + embeddings_size=10, + hidden_sizes=[10, 20, 30, 40], + depths=[1, 1, 2, 1], + hidden_act="relu", + num_labels=3, + out_features=["stage2", "stage3", "stage4"], + out_indices=[2, 3, 4], + ) + return DetrConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, 
+ use_timm_backbone=False, + backbone_config=resnet_config, + backbone=None, + use_pretrained_backbone=False, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} + return config, inputs_dict + + def create_and_check_detr_model(self, config, pixel_values, pixel_mask, labels): + model = DetrModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size) + ) + + def create_and_check_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): + model = DetrForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + result = model(pixel_values) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_torch +class DetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + DetrModel, + DetrForObjectDetection, + DetrForSegmentation, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "image-feature-extraction": DetrModel, + "image-segmentation": DetrForSegmentation, + "object-detection": DetrForObjectDetection, + } + if is_torch_available() + else {} + ) + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + zero_init_hidden_state = True + test_torch_exportable = True + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ in ["DetrForObjectDetection", "DetrForSegmentation"]: + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.min_size, + self.model_tester.max_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = DetrModelTester(self) + self.config_tester = ConfigTester(self, config_class=DetrConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_detr_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_detr_model(*config_and_inputs) + + def test_detr_object_detection_head_model(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_detr_object_detection_head_model(*config_and_inputs) + + # TODO: check if this works again for PyTorch 2.x.y + @unittest.skip(reason="Got `CUDA error: misaligned address` with PyTorch 2.0.0.") + def test_multi_gpu_data_parallel_forward(self): + pass + + @unittest.skip(reason="DETR does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="DETR does not use inputs_embeds") + def test_inputs_embeds_matches_input_ids(self): + pass + + @unittest.skip(reason="DETR does not have a get_input_embeddings method") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="DETR is not a generative model") + def test_generate_without_input_ids(self): + pass + + @unittest.skip(reason="DETR does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @slow + @unittest.skip(reason="TODO Niels: fix me!") + def test_model_outputs_equivalence(self): + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + decoder_seq_length = self.model_tester.decoder_seq_length + encoder_seq_length = self.model_tester.encoder_seq_length + decoder_key_length = self.model_tester.decoder_seq_length + encoder_key_length = self.model_tester.encoder_seq_length + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + if self.is_encoder_decoder: + correct_outlen = 5 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "DetrForObjectDetection": + correct_outlen += 2 + # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks + if model_class.__name__ == "DetrForSegmentation": + correct_outlen += 3 + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) + + # cross attentions + cross_attentions = 
outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_key_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_retain_grad_hidden_states_attentions(self): + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_auxiliary_loss(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.auxiliary_loss = True + + # only test for object detection and segmentation model + for model_class in self.all_model_classes[1:]: + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + + outputs = model(**inputs) + + self.assertIsNotNone(outputs.auxiliary_outputs) + self.assertEqual(len(outputs.auxiliary_outputs), self.model_tester.num_hidden_layers - 1) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + if model.config.is_encoder_decoder: + expected_arg_names = ["pixel_values", "pixel_mask"] + expected_arg_names.extend( + ["head_mask", "decoder_head_mask", 
"encoder_outputs"] + if "head_mask" and "decoder_head_mask" in arg_names + else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + else: + expected_arg_names = ["pixel_values", "pixel_mask"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_different_timm_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + config.backbone_config = None + config.use_timm_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "DetrForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels + 1, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + elif model_class.__name__ == "DetrForSegmentation": + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) + + self.assertTrue(outputs) + + def test_hf_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Load a pretrained HF checkpoint as backbone + config.backbone = "microsoft/resnet-18" + config.backbone_config = None + config.use_timm_backbone = False + config.use_pretrained_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "DetrForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels + 1, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + elif model_class.__name__ == "DetrForSegmentation": + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) + else: + # Confirm out_indices was propagated to backbone + self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) + + self.assertTrue(outputs) + + def test_greyscale_images(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # use greyscale pixel values + inputs_dict["pixel_values"] = floats_tensor( + [self.model_tester.batch_size, 1, self.model_tester.min_size, self.model_tester.max_size] + ) + + # let's set num_channels to 1 + config.num_channels = 1 + config.backbone_config.num_channels = 1 + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertTrue(outputs) + + def 
test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + configs_no_init.init_xavier_std = 1e9 + + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if "bbox_attention" in name and "bias" not in name: + self.assertLess( + 100000, + abs(param.data.max().item()), + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + +TOLERANCE = 1e-4 + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_timm +@require_vision +@slow +class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase): + @cached_property + def default_image_processor(self): + return DetrImageProcessor.from_pretrained("facebook/detr-resnet-50") if is_vision_available() else None + + def test_inference_no_head(self): + model = DetrModel.from_pretrained("facebook/detr-resnet-50").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(**encoding) + + expected_shape = torch.Size((1, 100, 256)) + assert outputs.last_hidden_state.shape == expected_shape + expected_slice = torch.tensor( + [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] + ).to(torch_device) + torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + def test_inference_object_detection_head(self): + model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + # verify outputs + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels + 1)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-19.1194, -0.0893, -11.0154], [-17.3640, -1.8035, -14.0219], [-20.0461, -0.5837, -11.1060]] + ).to(torch_device) + torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice_logits, rtol=1e-4, atol=1e-4) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.4433, 0.5302, 0.8853], [0.5494, 0.2517, 0.0529], [0.4998, 0.5360, 0.9956]] + ).to(torch_device) + torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, rtol=1e-4, atol=1e-4) + + # verify postprocessing + results = image_processor.post_process_object_detection( + outputs, threshold=0.3, target_sizes=[image.size[::-1]] + )[0] + expected_scores = torch.tensor([0.9982, 0.9960, 0.9955, 0.9988, 0.9987]).to(torch_device) + expected_labels = [75, 75, 63, 17, 17] + expected_slice_boxes = torch.tensor([40.1633, 70.8115, 175.5471, 
117.9841]).to(torch_device) + + self.assertEqual(len(results["scores"]), 5) + torch.testing.assert_close(results["scores"], expected_scores, rtol=1e-4, atol=1e-4) + self.assertSequenceEqual(results["labels"].tolist(), expected_labels) + torch.testing.assert_close(results["boxes"][0, :], expected_slice_boxes) + + def test_inference_panoptic_segmentation_head(self): + model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + pixel_values = encoding["pixel_values"].to(torch_device) + pixel_mask = encoding["pixel_mask"].to(torch_device) + + with torch.no_grad(): + outputs = model(pixel_values, pixel_mask) + + # verify outputs + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels + 1)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + expected_slice_logits = torch.tensor( + [[-18.1565, -1.7568, -13.5029], [-16.8888, -1.4138, -14.1028], [-17.5709, -2.5080, -11.8654]] + ).to(torch_device) + torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice_logits, rtol=1e-4, atol=1e-4) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + expected_slice_boxes = torch.tensor( + [[0.5344, 0.1789, 0.9285], [0.4420, 0.0572, 0.0875], [0.6630, 0.6887, 0.1017]] + ).to(torch_device) + torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, rtol=1e-4, atol=1e-4) + + expected_shape_masks = torch.Size((1, model.config.num_queries, 200, 267)) + self.assertEqual(outputs.pred_masks.shape, expected_shape_masks) + expected_slice_masks = torch.tensor( + [[-7.7558, -10.8788, -11.9797], [-11.8881, -16.4329, -17.7451], [-14.7316, -19.7383, -20.3004]] + ).to(torch_device) + torch.testing.assert_close(outputs.pred_masks[0, 0, :3, :3], expected_slice_masks, rtol=1e-3, atol=1e-3) + + # verify postprocessing + results = image_processor.post_process_panoptic_segmentation( + outputs, threshold=0.3, target_sizes=[image.size[::-1]] + )[0] + + expected_shape = torch.Size([480, 640]) + expected_slice_segmentation = torch.tensor([[4, 4, 4], [4, 4, 4], [4, 4, 4]], dtype=torch.int32).to( + torch_device + ) + expected_number_of_segments = 5 + expected_first_segment = {"id": 1, "label_id": 17, "was_fused": False, "score": 0.994097} + + number_of_unique_segments = len(torch.unique(results["segmentation"])) + self.assertTrue( + number_of_unique_segments, expected_number_of_segments + 1 + ) # we add 1 for the background class + self.assertTrue(results["segmentation"].shape, expected_shape) + torch.testing.assert_close(results["segmentation"][:3, :3], expected_slice_segmentation, rtol=1e-4, atol=1e-4) + self.assertTrue(len(results["segments_info"]), expected_number_of_segments) + + predicted_first_segment = results["segments_info"][0] + self.assertEqual(predicted_first_segment["id"], expected_first_segment["id"]) + self.assertEqual(predicted_first_segment["label_id"], expected_first_segment["label_id"]) + self.assertEqual(predicted_first_segment["was_fused"], expected_first_segment["was_fused"]) + self.assertAlmostEqual(predicted_first_segment["score"], expected_first_segment["score"], places=3) + + +@require_vision +@require_torch +@slow +class DetrModelIntegrationTests(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ( + 
DetrImageProcessor.from_pretrained("facebook/detr-resnet-50", revision="no_timm") + if is_vision_available() + else None + ) + + def test_inference_no_head(self): + model = DetrModel.from_pretrained("facebook/detr-resnet-50", revision="no_timm").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + encoding = image_processor(images=image, return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(**encoding) + + expected_shape = torch.Size((1, 100, 256)) + assert outputs.last_hidden_state.shape == expected_shape + expected_slice = torch.tensor( + [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] + ).to(torch_device) + torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/diffllama/__init__.py b/docs/transformers/tests/models/diffllama/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/diffllama/test_modeling_diffllama.py b/docs/transformers/tests/models/diffllama/test_modeling_diffllama.py new file mode 100644 index 0000000000000000000000000000000000000000..c738fbf76d1a14c6da009f7b1372b6537110ec8b --- /dev/null +++ b/docs/transformers/tests/models/diffllama/test_modeling_diffllama.py @@ -0,0 +1,883 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch DiffLlama model.""" + +import gc +import tempfile +import unittest + +import pytest +from packaging import version +from parameterized import parameterized + +from transformers import AutoTokenizer, DiffLlamaConfig, StaticCache, is_torch_available, set_seed +from transformers.testing_utils import ( + backend_empty_cache, + cleanup, + require_bitsandbytes, + require_flash_attn, + require_read_token, + require_torch, + require_torch_accelerator, + require_torch_gpu, + require_torch_sdpa, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + DiffLlamaForCausalLM, + DiffLlamaForQuestionAnswering, + DiffLlamaForSequenceClassification, + DiffLlamaForTokenClassification, + DiffLlamaModel, + ) + from transformers.models.diffllama.modeling_diffllama import ( + DiffLlamaRotaryEmbedding, + ) + + +class DiffLlamaModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + pad_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return DiffLlamaConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + 
num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DiffLlamaModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class DiffLlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + DiffLlamaModel, + DiffLlamaForCausalLM, + DiffLlamaForSequenceClassification, + DiffLlamaForQuestionAnswering, + DiffLlamaForTokenClassification, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": DiffLlamaModel, + "text-classification": DiffLlamaForSequenceClassification, + "text-generation": DiffLlamaForCausalLM, + "zero-shot": DiffLlamaForSequenceClassification, + "question-answering": DiffLlamaForQuestionAnswering, + "token-classification": DiffLlamaForTokenClassification, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + fx_compatible = False + + # Need to use `0.8` instead of `0.9` for `test_cpu_offload` + # This is because we are hitting edge cases with the causal_mask buffer + model_split_percents = [0.5, 0.7, 0.8] + + # used in `test_torch_compile_for_training` + _torch_compile_train_cls = DiffLlamaForCausalLM if is_torch_available() else None + + def setUp(self): + self.model_tester = DiffLlamaModelTester(self) + self.config_tester = ConfigTester(self, config_class=DiffLlamaConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_diffllama_sequence_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = DiffLlamaForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, 
labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_diffllama_sequence_classification_model_for_single_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "single_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = DiffLlamaForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_diffllama_sequence_classification_model_for_multi_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "multi_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor( + [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size + ).to(torch.float) + model = DiffLlamaForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_diffllama_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) + model = DiffLlamaForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), + ) + + @parameterized.expand([("linear",), ("dynamic",), ("yarn",)]) + def test_model_rope_scaling_from_config(self, scaling_type): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + short_input = ids_tensor([1, 10], config.vocab_size) + long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + original_model = DiffLlamaModel(config) + original_model.to(torch_device) + original_model.eval() + original_short_output = original_model(short_input).last_hidden_state + original_long_output = original_model(long_input).last_hidden_state + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + config.rope_scaling = {"type": scaling_type, "factor": 10.0} + scaled_model = DiffLlamaModel(config) + scaled_model.to(torch_device) + scaled_model.eval() + scaled_short_output = scaled_model(short_input).last_hidden_state + scaled_long_output = scaled_model(long_input).last_hidden_state + + # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original + # maximum sequence length, so the outputs for the short input should match. 
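+ # For "linear" and "yarn" the RoPE embeddings change at every position (linear scaling, for
+ # instance, divides the position index by the factor, here 10.0), so even the short input is
+ # expected to diverge from the unscaled model, hence the `else` branch below.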
+ if scaling_type == "dynamic": + torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) + else: + self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) + + # The output should be different for long inputs + self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) + + def test_model_rope_scaling(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + scaling_factor = 10 + short_input_length = 10 + long_input_length = int(config.max_position_embeddings * 1.5) + + # Inputs + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device + position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) + position_ids_short = position_ids_short.unsqueeze(0) + position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) + position_ids_long = position_ids_long.unsqueeze(0) + + # Sanity check original RoPE + original_rope = DiffLlamaRotaryEmbedding(config=config).to(torch_device) + original_cos_short, original_sin_short = original_rope(x, position_ids_short) + original_cos_long, original_sin_long = original_rope(x, position_ids_long) + torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) + + # Sanity check linear RoPE scaling + # New position "x" should match original position with index "x/scaling_factor" + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + linear_scaling_rope = DiffLlamaRotaryEmbedding(config=config).to(torch_device) + linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) + linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) + torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) + for new_position in range(0, long_input_length, scaling_factor): + original_position = int(new_position // scaling_factor) + torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) + torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) + + # Sanity check Dynamic NTK RoPE scaling + # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase + # with scaling_factor (or that `inv_freq` decreases) + config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} + ntk_scaling_rope = DiffLlamaRotaryEmbedding(config=config).to(torch_device) + ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) + ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) + torch.testing.assert_close(ntk_cos_short, original_cos_short) + torch.testing.assert_close(ntk_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_sin_long, original_sin_long) + self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) + + # Sanity check Yarn RoPE scaling + # Scaling should be over the entire input + config.rope_scaling = {"type": "yarn", "factor": scaling_factor} + yarn_scaling_rope = DiffLlamaRotaryEmbedding(config=config).to(torch_device) + yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) + yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) + torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_short, original_cos_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_long, original_sin_long) + + def test_model_loading_old_rope_configs(self): + def _reinitialize_config(base_config, new_kwargs): + # Reinitialize the config with the new kwargs, forcing the config to go through its __init__ validation + # steps. 
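+ # Rebuilding via `from_dict` mirrors what happens when a checkpoint's config.json is loaded from
+ # disk, which is where the backward-compatibility warnings/errors asserted below normally surface.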
+ base_config_dict = base_config.to_dict() + new_config = DiffLlamaConfig.from_dict(config_dict={**base_config_dict, **new_kwargs}) + return new_config + + # from untouched config -> ✅ + base_config, model_inputs = self.model_tester.prepare_config_and_inputs_for_common() + original_model = DiffLlamaForCausalLM(base_config).to(torch_device) + original_model(**model_inputs) + + # from a config with the expected rope configuration -> ✅ + config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear", "factor": 10.0}}) + original_model = DiffLlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + + # from a config with the old rope configuration ('type' instead of 'rope_type') -> ✅ we gracefully handle BC + config = _reinitialize_config(base_config, {"rope_scaling": {"type": "linear", "factor": 10.0}}) + original_model = DiffLlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + + # from a config with both 'type' and 'rope_type' -> ✅ they can coexist (and both are present in the config) + config = _reinitialize_config( + base_config, {"rope_scaling": {"type": "linear", "rope_type": "linear", "factor": 10.0}} + ) + self.assertTrue(config.rope_scaling["type"] == "linear") + self.assertTrue(config.rope_scaling["rope_type"] == "linear") + original_model = DiffLlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + + # from a config with parameters in a bad range ('factor' should be >= 1.0) -> ⚠️ throws a warning + with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: + config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear", "factor": -999.0}}) + original_model = DiffLlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + self.assertEqual(len(logs.output), 1) + self.assertIn("factor field", logs.output[0]) + + # from a config with unknown parameters ('foo' isn't a rope option) -> ⚠️ throws a warning + with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: + config = _reinitialize_config( + base_config, {"rope_scaling": {"rope_type": "linear", "factor": 10.0, "foo": "bar"}} + ) + original_model = DiffLlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + self.assertEqual(len(logs.output), 1) + self.assertIn("Unrecognized keys", logs.output[0]) + + # from a config with specific rope type but missing one of its mandatory parameters -> ❌ throws exception + with self.assertRaises(KeyError): + config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}}) # missing "factor" + + @require_flash_attn + @require_torch_gpu + @require_bitsandbytes + @pytest.mark.flash_attn_test + @require_read_token + @slow + def test_flash_attn_2_generate_padding_right(self): + """ + Overwriting the common test as the test is flaky on tiny models + """ + model = DiffLlamaForCausalLM.from_pretrained( + "kajuma/DiffLlama-0.3B-handcut", + load_in_4bit=True, + device_map={"": 0}, + ) + + tokenizer = AutoTokenizer.from_pretrained("kajuma/DiffLlama-0.3B-handcut") + + texts = ["hi", "Hello this is a very long sentence"] + + tokenizer.padding_side = "right" + tokenizer.pad_token = tokenizer.eos_token + + inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) + + output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_native = tokenizer.batch_decode(output_native) + + model = DiffLlamaForCausalLM.from_pretrained( + "kajuma/DiffLlama-0.3B-handcut", + 
load_in_4bit=True, + device_map={"": 0}, + attn_implementation="flash_attention_2", + ) + + output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_fa_2 = tokenizer.batch_decode(output_fa_2) + + self.assertListEqual(output_native, output_fa_2) + + @require_flash_attn + @require_torch_gpu + @slow + @pytest.mark.flash_attn_test + def test_use_flash_attention_2_true(self): + """ + NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended. + """ + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + with tempfile.TemporaryDirectory() as tmp_dir: + model = model_class(config) + model.save_pretrained(tmp_dir) + + new_model = DiffLlamaForCausalLM.from_pretrained( + tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16 + ).to("cuda") + + self.assertTrue(new_model.config._attn_implementation == "flash_attention_2") + + has_flash = False + for name, submodule in new_model.named_modules(): + if "FlashAttention" in submodule.__class__.__name__: + has_flash = True + break + if not has_flash: + raise ValueError("The flash model should have flash attention layers") + + @require_torch_sdpa + @slow + def test_eager_matches_sdpa_generate(self): + """ + Overwriting the common test as the test is flaky on tiny models + """ + max_new_tokens = 30 + + tokenizer = AutoTokenizer.from_pretrained("kajuma/DiffLlama-0.3B-handcut") + + model_sdpa = DiffLlamaForCausalLM.from_pretrained( + "kajuma/DiffLlama-0.3B-handcut", + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + ).to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = DiffLlamaForCausalLM.from_pretrained( + "kajuma/DiffLlama-0.3B-handcut", + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + attn_implementation="eager", + ).to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + has_sdpa = True + break + if not has_sdpa: + raise ValueError("The SDPA model should have SDPA attention layers") + + texts = [ + "hi here's a longer context, getting longer and", + "Hello this is a very long sentence my friend, very long for real", + "Today I am in Paris and", + ] + + for padding_side in ["left", "right"]: + tokenizer.padding_side = padding_side + tokenizer.pad_token = tokenizer.eos_token + + inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) + + res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) + res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) + + with self.subTest(f"{padding_side}"): + torch.testing.assert_close( + res_eager, + res_sdpa, + msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", + ) + + +@require_torch_accelerator +class DiffLlamaIntegrationTest(unittest.TestCase): + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and 
torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + def tearDown(self): + # See LlamaIntegrationTest.tearDown(). Can be removed once LlamaIntegrationTest.tearDown() is removed. + cleanup(torch_device, gc_collect=False) + + @slow + @require_torch_accelerator + @require_read_token + def test_compile_static_cache(self): + # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2 + # work as intended. See https://github.com/pytorch/pytorch/issues/121943 + if version.parse(torch.__version__) < version.parse("2.3.0"): + self.skipTest(reason="This test requires torch >= 2.3 to run.") + + NUM_TOKENS_TO_GENERATE = 40 + # Note on `EXPECTED_TEXT_COMPLETION`'s diff: the current value matches the original test if the original test + # was changed to have a cache of 53 tokens (as opposed to 4096), on Ampere GPUs. + EXPECTED_TEXT_COMPLETION = [ + "Simply put, the theory of relativity states that 1) the speed of light is constant in all inertial " + "reference frames, and 2) the laws of physics are the same for all inertial reference frames.\nThe " + "theory of relativ", + "My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, " + "my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p", + ] + + prompts = [ + "Simply put, the theory of relativity states that ", + "My favorite all time favorite condiment is ketchup.", + ] + tokenizer = AutoTokenizer.from_pretrained( + "kajuma/DiffLlama-0.3B-handcut", pad_token="", padding_side="right" + ) + model = DiffLlamaForCausalLM.from_pretrained( + "kajuma/DiffLlama-0.3B-handcut", device_map=torch_device, torch_dtype=torch.float16 + ) + inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + + # Dynamic Cache + generated_ids = model.generate(**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False) + dynamic_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, dynamic_text) + + # Static Cache + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" + ) + static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, static_text) + + # Static Cache + compile + model._cache = None # clear cache object, initialized when we pass `cache_implementation="static"` + model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" + ) + static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, static_compiled_text) + + +@slow +@require_torch_accelerator +class Mask4DTestHard(unittest.TestCase): + def tearDown(self): + gc.collect() + backend_empty_cache(torch_device) + + def setUp(self): + model_name = "kajuma/DiffLlama-0.3B-handcut" + self.model_dtype = torch.float32 + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = DiffLlamaForCausalLM.from_pretrained(model_name, torch_dtype=self.model_dtype).to(torch_device) + + def get_test_data(self): + template = "my favorite {}" + items = ("pet is a", "artist plays a", "name is L") # same number of tokens in each item + + batch_separate = 
[template.format(x) for x in items] # 3 separate lines + batch_shared_prefix = template.format(" ".join(items)) # 1 line with options concatenated + + input_ids = self.tokenizer(batch_separate, return_tensors="pt").input_ids.to(torch_device) + input_ids_shared_prefix = self.tokenizer(batch_shared_prefix, return_tensors="pt").input_ids.to(torch_device) + + mask_shared_prefix = torch.tensor( + [ + [ + [ + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0], + [1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0], + [1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1], + ] + ] + ], + device=torch_device, + ) + + position_ids = torch.arange(input_ids.shape[1]).tile(input_ids.shape[0], 1).to(torch_device) + + # building custom positions ids based on custom mask + position_ids_shared_prefix = (mask_shared_prefix.sum(dim=-1) - 1).reshape(1, -1) + # effectively: position_ids_shared_prefix = torch.tensor([[0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5]]).to(device) + + # inverting the mask + min_dtype = torch.finfo(self.model_dtype).min + mask_shared_prefix = (mask_shared_prefix.eq(0.0)).to(dtype=self.model_dtype) * min_dtype + + return input_ids, position_ids, input_ids_shared_prefix, mask_shared_prefix, position_ids_shared_prefix + + def test_stacked_causal_mask(self): + ( + input_ids, + position_ids, + input_ids_shared_prefix, + mask_shared_prefix, + position_ids_shared_prefix, + ) = self.get_test_data() + + # regular batch + logits = self.model.forward(input_ids, position_ids=position_ids).logits + logits_last = logits[:, -1, :] # last tokens in each batch line + decoded = [self.tokenizer.decode(t) for t in logits_last.argmax(dim=-1)] + + # single forward run with 4D custom mask + logits_shared_prefix = self.model.forward( + input_ids_shared_prefix, attention_mask=mask_shared_prefix, position_ids=position_ids_shared_prefix + ).logits + logits_shared_prefix_last = logits_shared_prefix[ + 0, torch.where(position_ids_shared_prefix == position_ids_shared_prefix.max())[1], : + ] # last three tokens + decoded_shared_prefix = [self.tokenizer.decode(t) for t in logits_shared_prefix_last.argmax(dim=-1)] + + self.assertEqual(decoded, decoded_shared_prefix) + + def test_partial_stacked_causal_mask(self): + # Same as the test above, but the input is passed in two groups. It tests that we can pass partial 4D attention masks + + ( + input_ids, + position_ids, + input_ids_shared_prefix, + mask_shared_prefix, + position_ids_shared_prefix, + ) = self.get_test_data() + + # regular batch + logits = self.model.forward(input_ids, position_ids=position_ids).logits + logits_last = logits[:, -1, :] # last tokens in each batch line + decoded = [self.tokenizer.decode(t) for t in logits_last.argmax(dim=-1)] + + # 2 forward runs with custom 4D masks + part_a = 3 # split point + + input_1a = input_ids_shared_prefix[:, :part_a] + position_ids_1a = position_ids_shared_prefix[:, :part_a] + mask_1a = mask_shared_prefix[:, :, :part_a, :part_a] + + outs_1a = self.model.forward(input_1a, attention_mask=mask_1a, position_ids=position_ids_1a) + past_key_values_a = outs_1a["past_key_values"] + + # Case 1: we pass a 4D attention mask regarding the current sequence length (i.e. 
[..., seq_len, full_len]) + input_1b = input_ids_shared_prefix[:, part_a:] + position_ids_1b = position_ids_shared_prefix[:, part_a:] + mask_1b = mask_shared_prefix[:, :, part_a:, :] + outs_1b = self.model.forward( + input_1b, + attention_mask=mask_1b, + position_ids=position_ids_1b, + past_key_values=past_key_values_a, + ) + decoded_1b = [ + self.tokenizer.decode(t) + for t in outs_1b.logits.argmax(-1)[ + 0, torch.where(position_ids_shared_prefix == position_ids_shared_prefix.max())[1] - part_a + ] + ] + self.assertEqual(decoded, decoded_1b) + + def test_stacked_causal_mask_static_cache(self): + """same as above but with StaticCache""" + ( + input_ids, + position_ids, + input_ids_shared_prefix, + mask_shared_prefix, + position_ids_shared_prefix, + ) = self.get_test_data() + + # regular batch + logits = self.model.forward(input_ids, position_ids=position_ids).logits + logits_last = logits[:, -1, :] # last tokens in each batch line + decoded = [self.tokenizer.decode(t) for t in logits_last.argmax(dim=-1)] + + # upgrade the model with StaticCache + max_cache_len = 16 # note that max_cache_len is greater than the attention_mask.shape[-1] + past_key_values = StaticCache( + config=self.model.config, + max_batch_size=1, + max_cache_len=max_cache_len, + device=torch_device, + dtype=self.model.dtype, + ) + + padded_attention_mask = torch.nn.functional.pad( + input=mask_shared_prefix, + pad=(0, max_cache_len - mask_shared_prefix.shape[-1]), + mode="constant", + value=torch.finfo(self.model_dtype).min, + ) + + # single forward run with 4D custom mask + logits_shared_prefix = self.model.forward( + input_ids_shared_prefix, + attention_mask=padded_attention_mask, + position_ids=position_ids_shared_prefix, + cache_position=torch.arange(input_ids_shared_prefix.shape[-1], device=torch_device), + past_key_values=past_key_values, + ).logits + logits_shared_prefix_last = logits_shared_prefix[ + 0, torch.where(position_ids_shared_prefix == position_ids_shared_prefix.max())[1], : + ] # last three tokens + decoded_shared_prefix = [self.tokenizer.decode(t) for t in logits_shared_prefix_last.argmax(dim=-1)] + + self.assertEqual(decoded, decoded_shared_prefix) + + def test_partial_stacked_causal_mask_static_cache(self): + # Same as the test above, but the input is passed in two groups. 
It tests that we can pass partial 4D attention masks + # we pass a 4D attention mask shaped [..., seq_len, full_static_cache_len]) + ( + input_ids, + position_ids, + input_ids_shared_prefix, + mask_shared_prefix, + position_ids_shared_prefix, + ) = self.get_test_data() + + # regular batch + logits = self.model.forward(input_ids, position_ids=position_ids).logits + logits_last = logits[:, -1, :] # last tokens in each batch line + decoded = [self.tokenizer.decode(t) for t in logits_last.argmax(dim=-1)] + + # upgrade the model with StaticCache + max_cache_len = 16 # note that max_cache_len is greater than the attention_mask.shape[-1] + past_key_values = StaticCache( + config=self.model.config, + max_batch_size=1, + max_cache_len=max_cache_len, + device=torch_device, + dtype=self.model.dtype, + ) + + # forward run for the first part of input + part_a = 3 # split point + + input_1a = input_ids_shared_prefix[:, :part_a] + position_ids_1a = position_ids_shared_prefix[:, :part_a] + mask_1a = mask_shared_prefix[:, :, :part_a, :part_a] + + padded_mask_1a = torch.nn.functional.pad( + input=mask_1a, + pad=(0, max_cache_len - mask_1a.shape[-1]), + mode="constant", + value=torch.finfo(self.model_dtype).min, + ) + + _ = self.model.forward( + input_1a, + attention_mask=padded_mask_1a, + position_ids=position_ids_1a, + cache_position=torch.arange(part_a, device=torch_device), + past_key_values=past_key_values, + ) + + # forward run for the second part of input + input_1b = input_ids_shared_prefix[:, part_a:] + position_ids_1b = position_ids_shared_prefix[:, part_a:] + mask_1b = mask_shared_prefix[:, :, part_a:, :] + + padded_mask_1b = torch.nn.functional.pad( + input=mask_1b, pad=(0, max_cache_len - mask_1b.shape[-1]), mode="constant", value=0 + ) + + outs_1b = self.model.forward( + input_1b, + attention_mask=padded_mask_1b, + position_ids=position_ids_1b, + cache_position=torch.arange( + part_a, + input_ids_shared_prefix.shape[-1], + device=torch_device, + ), + past_key_values=past_key_values, + ) + decoded_1b = [ + self.tokenizer.decode(t) + for t in outs_1b.logits.argmax(-1)[ + 0, torch.where(position_ids_shared_prefix == position_ids_shared_prefix.max())[1] - part_a + ] + ] + self.assertEqual(decoded, decoded_1b) diff --git a/docs/transformers/tests/models/dinat/__init__.py b/docs/transformers/tests/models/dinat/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/dinat/test_modeling_dinat.py b/docs/transformers/tests/models/dinat/test_modeling_dinat.py new file mode 100644 index 0000000000000000000000000000000000000000..1de68988e7266d9bdbec906a1f54327d9b5bab22 --- /dev/null +++ b/docs/transformers/tests/models/dinat/test_modeling_dinat.py @@ -0,0 +1,378 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
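+# NOTE: the Dinat tests require the `natten` package for neighborhood attention (hence the
+# @require_natten decorator below) and cover DinatModel, DinatForImageClassification and DinatBackbone.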
+"""Testing suite for the PyTorch Dinat model.""" + +import collections +import unittest + +from transformers import DinatConfig +from transformers.testing_utils import require_natten, require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_backbone_common import BackboneTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import DinatBackbone, DinatForImageClassification, DinatModel + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class DinatModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=64, + patch_size=4, + num_channels=3, + embed_dim=16, + depths=[1, 2, 1], + num_heads=[2, 4, 8], + kernel_size=3, + dilations=[[3], [1, 2], [1]], + mlp_ratio=2.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + patch_norm=True, + initializer_range=0.02, + layer_norm_eps=1e-5, + is_training=True, + scope=None, + use_labels=True, + num_labels=10, + out_features=["stage1", "stage2"], + out_indices=[1, 2], + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_heads = num_heads + self.kernel_size = kernel_size + self.dilations = dilations + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.patch_norm = patch_norm + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.is_training = is_training + self.scope = scope + self.use_labels = use_labels + self.num_labels = num_labels + self.out_features = out_features + self.out_indices = out_indices + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DinatConfig( + num_labels=self.num_labels, + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + embed_dim=self.embed_dim, + depths=self.depths, + num_heads=self.num_heads, + kernel_size=self.kernel_size, + dilations=self.dilations, + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + drop_path_rate=self.drop_path_rate, + hidden_act=self.hidden_act, + patch_norm=self.patch_norm, + layer_norm_eps=self.layer_norm_eps, + initializer_range=self.initializer_range, + out_features=self.out_features, + out_indices=self.out_indices, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DinatModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + expected_height = expected_width = (config.image_size // 
config.patch_size) // (2 ** (len(config.depths) - 1)) + expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1)) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, expected_height, expected_width, expected_dim) + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + model = DinatForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + # test greyscale images + config.num_channels = 1 + model = DinatForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_backbone(self, config, pixel_values, labels): + model = DinatBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify hidden states + self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], 16, 16]) + + # verify channels + self.parent.assertEqual(len(model.channels), len(config.out_features)) + + # verify backbone works with out_features=None + config.out_features = None + model = DinatBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), 1) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, model.channels[-1], 4, 4]) + + # verify channels + self.parent.assertEqual(len(model.channels), 1) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_natten +@require_torch +class DinatModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + DinatModel, + DinatForImageClassification, + DinatBackbone, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + {"image-feature-extraction": DinatModel, "image-classification": DinatForImageClassification} + if is_torch_available() + else {} + ) + fx_compatible = False + + test_torchscript = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = DinatModelTester(self) + self.config_tester = ConfigTester( + self, config_class=DinatConfig, embed_dim=37, common_properties=["patch_size", "num_channels"] + ) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + def test_backbone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_backbone(*config_and_inputs) + + @unittest.skip(reason="Dinat does not use inputs_embeds") + def test_inputs_embeds(self): + pass 
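As a sanity check on the shapes asserted above, here is the resolution and width arithmetic worked out for this tester's defaults (image_size=64, patch_size=4, embed_dim=16, depths=[1, 2, 1]); a rough illustrative sketch, not part of the test suite itself.

# Worked example of the Dinat shape arithmetic, using the tester defaults above.
image_size, patch_size, embed_dim, depths = 64, 4, 16, [1, 2, 1]

stage1_resolution = image_size // patch_size                     # 64 // 4 = 16 -> stage1 feature maps are 16x16
num_downsamples = len(depths) - 1                                # two downsampling steps between the three stages
final_resolution = stage1_resolution // (2 ** num_downsamples)   # 16 // 4 = 4 -> last-stage maps are 4x4
final_dim = int(embed_dim * 2 ** num_downsamples)                # 16 * 4 = 64 = width of last_hidden_state

assert (stage1_resolution, final_resolution, final_dim) == (16, 4, 64)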
+ + @unittest.skip(reason="Dinat does not use feedforward chunking") + def test_feed_forward_chunking(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_attention_outputs(self): + self.skipTest(reason="Dinat's attention operation is handled entirely by NATTEN.") + + def check_hidden_states_output(self, inputs_dict, config, model_class, image_size): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # Dinat has a different seq_length + patch_size = ( + config.patch_size + if isinstance(config.patch_size, collections.abc.Iterable) + else (config.patch_size, config.patch_size) + ) + + height = image_size[0] // patch_size[0] + width = image_size[1] // patch_size[1] + + self.assertListEqual( + list(hidden_states[0].shape[-3:]), + [height, width, self.model_tester.embed_dim], + ) + + if model_class.__name__ != "DinatBackbone": + reshaped_hidden_states = outputs.reshaped_hidden_states + self.assertEqual(len(reshaped_hidden_states), expected_num_layers) + + batch_size, num_channels, height, width = reshaped_hidden_states[0].shape + reshaped_hidden_states = ( + reshaped_hidden_states[0].view(batch_size, num_channels, height, width).permute(0, 2, 3, 1) + ) + self.assertListEqual( + list(reshaped_hidden_states.shape[-3:]), + [height, width, self.model_tester.embed_dim], + ) + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + image_size = ( + self.model_tester.image_size + if isinstance(self.model_tester.image_size, collections.abc.Iterable) + else (self.model_tester.image_size, self.model_tester.image_size) + ) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + @slow + def test_model_from_pretrained(self): + model_name = "shi-labs/dinat-mini-in1k-224" + model = DinatModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if "embeddings" not in name and param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + +@require_natten +@require_vision +@require_torch +class DinatModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return 
AutoImageProcessor.from_pretrained("shi-labs/dinat-mini-in1k-224") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = DinatForImageClassification.from_pretrained("shi-labs/dinat-mini-in1k-224").to(torch_device) + image_processor = self.default_image_processor + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + expected_slice = torch.tensor([-0.1545, -0.7667, 0.4642]).to(torch_device) + torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4) + + +@require_torch +@require_natten +class DinatBackboneTest(unittest.TestCase, BackboneTesterMixin): + all_model_classes = (DinatBackbone,) if is_torch_available() else () + config_class = DinatConfig + + def setUp(self): + self.model_tester = DinatModelTester(self) diff --git a/docs/transformers/tests/models/dinov2/__init__.py b/docs/transformers/tests/models/dinov2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/dinov2/test_modeling_dinov2.py b/docs/transformers/tests/models/dinov2/test_modeling_dinov2.py new file mode 100644 index 0000000000000000000000000000000000000000..49a70380d3a94d1c6c01500313147a62ef3feee3 --- /dev/null +++ b/docs/transformers/tests/models/dinov2/test_modeling_dinov2.py @@ -0,0 +1,345 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
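Before the Dinov2 tests, a quick worked check of the "seq length = num_patches + 1 ([CLS])" rule they rely on. The 30/2 values are the tester defaults used below; 224/14 is the usual dinov2-base setting assumed here to explain the (1, 257, 768) shape in the integration test. This is an illustrative sketch, not part of the suite.

def dinov2_seq_length(image_size: int, patch_size: int) -> int:
    # number of patches on a square image, plus one [CLS] token
    num_patches = (image_size // patch_size) ** 2
    return num_patches + 1

assert dinov2_seq_length(30, 2) == 226    # Dinov2ModelTester defaults below
assert dinov2_seq_length(224, 14) == 257  # assumed dinov2-base setting; matches the (1, 257, 768) expected shape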
+"""Testing suite for the PyTorch Dinov2 model.""" + +import unittest + +from transformers import Dinov2Config +from transformers.testing_utils import ( + is_flaky, + require_torch, + require_vision, + slow, + torch_device, +) +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_backbone_common import BackboneTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import Dinov2Backbone, Dinov2ForImageClassification, Dinov2Model + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class Dinov2ModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + scope=None, + attn_implementation="eager", + mask_ratio=0.5, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + self.attn_implementation = attn_implementation + self.mask_ratio = mask_ratio + + # in Dinov2, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + self.num_masks = int(self.mask_ratio * self.seq_length) + self.mask_length = num_patches + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return Dinov2Config( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + attn_implementation=self.attn_implementation, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = Dinov2Model(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_backbone(self, config, pixel_values, 
labels): + model = Dinov2Backbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify hidden states + self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) + expected_size = self.image_size // config.patch_size + self.parent.assertListEqual( + list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], expected_size, expected_size] + ) + + # verify channels + self.parent.assertEqual(len(model.channels), len(config.out_features)) + + # verify backbone works with out_features=None + config.out_features = None + model = Dinov2Backbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), 1) + self.parent.assertListEqual( + list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], expected_size, expected_size] + ) + + # verify channels + self.parent.assertEqual(len(model.channels), 1) + + # verify backbone works with apply_layernorm=False and reshape_hidden_states=False + config.apply_layernorm = False + config.reshape_hidden_states = False + + model = Dinov2Backbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), 1) + self.parent.assertListEqual( + list(result.feature_maps[0].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = Dinov2ForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = Dinov2ForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class Dinov2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as Dinov2 does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + test_torch_exportable = True + + all_model_classes = ( + ( + Dinov2Model, + Dinov2ForImageClassification, + Dinov2Backbone, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + {"image-feature-extraction": Dinov2Model, "image-classification": Dinov2ForImageClassification} + if is_torch_available() + else {} + ) + fx_compatible = True + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = Dinov2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=Dinov2Config, has_text_modality=False, hidden_size=37) + + @is_flaky(max_attempts=3, description="`torch.nn.init.trunc_normal_` is flaky.") + def test_initialization(self): + super().test_initialization() + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="Dinov2 does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_backbone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_backbone(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @unittest.skip(reason="Dinov2 does not support feedforward chunking yet") + def test_feed_forward_chunking(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/dinov2-base" + model = Dinov2Model.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class Dinov2ModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return AutoImageProcessor.from_pretrained("facebook/dinov2-base") if is_vision_available() else None + + @slow + def test_inference_no_head(self): + model = Dinov2Model.from_pretrained("facebook/dinov2-base").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(image, return_tensors="pt").to(torch_device) + + # forward pass + with 
torch.no_grad(): + outputs = model(**inputs) + + # verify the last hidden states + expected_shape = torch.Size((1, 257, 768)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[-2.2005, -0.4495, 1.0964], [-3.3959, -0.8942, -1.0315], [-2.9355, 1.1564, -0.7656]], + device=torch_device, + ) + torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-3, atol=1e-3) + + +@require_torch +class Dinov2BackboneTest(unittest.TestCase, BackboneTesterMixin): + all_model_classes = (Dinov2Backbone,) if is_torch_available() else () + config_class = Dinov2Config + + has_attentions = False + + def setUp(self): + self.model_tester = Dinov2ModelTester(self) diff --git a/docs/transformers/tests/models/dinov2/test_modeling_flax_dinov2.py b/docs/transformers/tests/models/dinov2/test_modeling_flax_dinov2.py new file mode 100644 index 0000000000000000000000000000000000000000..161e49e3db948d22534adf7a9f8f544be2af746f --- /dev/null +++ b/docs/transformers/tests/models/dinov2/test_modeling_flax_dinov2.py @@ -0,0 +1,270 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the Flax Dinov2 model.""" + +import inspect +import unittest + +import numpy as np + +from transformers import Dinov2Config +from transformers.testing_utils import require_flax, require_vision, slow +from transformers.utils import cached_property, is_flax_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor + + +if is_flax_available(): + import jax + + from transformers.models.dinov2.modeling_flax_dinov2 import FlaxDinov2ForImageClassification, FlaxDinov2Model + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class FlaxDinov2ModelTester: + def __init__( + self, + parent, + batch_size=2, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + + # in Dinov2, the seq length equals the number of patches + 1 (we add 1 for the [CLS] 
token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + config = Dinov2Config( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return config, pixel_values + + # Copied from transformers.models.vit.test_modeling_flax_vit.FlaxViTModelTester.prepare_config_and_inputs with ViT -> Dinov2 + def create_and_check_model(self, config, pixel_values): + model = FlaxDinov2Model(config=config) + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + + # Copied from transformers.models.vit.test_modeling_flax_vit.FlaxViTModelTester.create_and_check_for_image_classification with ViT -> Dinov2 + def create_and_check_for_image_classification(self, config, pixel_values): + config.num_labels = self.type_sequence_label_size + model = FlaxDinov2ForImageClassification(config=config) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = FlaxDinov2ForImageClassification(config) + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + + # Copied from transformers.models.vit.test_modeling_flax_vit.FlaxViTModelTester.prepare_config_and_inputs_for_common + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_flax +# Copied from transformers.models.vit.test_modeling_flax_vit.FlaxViTModelTest with google/vit-base-patch16-224 -> facebook/dinov2-base +class FlaxDionv2ModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = (FlaxDinov2Model, FlaxDinov2ForImageClassification) if is_flax_available() else () + + def setUp(self) -> None: + self.model_tester = FlaxDinov2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=Dinov2Config, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + # We need to override this test because Dinov2's forward signature is different than text models. 
+ def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.__call__) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + # We need to override this test because Dinov2 expects pixel_values instead of input_ids + def test_jit_compilation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + with self.subTest(model_class.__name__): + prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + + @jax.jit + def model_jitted(pixel_values, **kwargs): + return model(pixel_values=pixel_values, **kwargs) + + with self.subTest("JIT Enabled"): + jitted_outputs = model_jitted(**prepared_inputs_dict).to_tuple() + + with self.subTest("JIT Disabled"): + with jax.disable_jit(): + outputs = model_jitted(**prepared_inputs_dict).to_tuple() + + self.assertEqual(len(outputs), len(jitted_outputs)) + for jitted_output, output in zip(jitted_outputs, outputs): + self.assertEqual(jitted_output.shape, output.shape) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("facebook/dinov2-base") + outputs = model(np.ones((1, 3, 224, 224))) + self.assertIsNotNone(outputs) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return [image, image] + + +@require_vision +@require_flax +class FlaxDinov2ModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return AutoImageProcessor.from_pretrained("facebook/dinov2-base") if is_vision_available() else None + + @slow + def test_inference_no_head(self): + model = FlaxDinov2Model.from_pretrained("facebook/dinov2-base") + + image_processor = self.default_image_processor + image = prepare_img() + pixel_values = image_processor(images=image, return_tensors="np").pixel_values + + # forward pass + outputs = model(pixel_values=pixel_values) + + # verify the logits + expected_shape = (2, 257, 768) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + expected_slice = np.array( + [ + [ + [-2.1629121, -0.46566057, 1.0925977], + [-3.5971704, -1.0283585, -1.1780515], + [-2.900407, 1.1334689, -0.74357724], + ], + [ + [-2.1629121, -0.46566057, 1.0925977], + [-3.5971704, -1.0283585, -1.1780515], + [-2.900407, 1.1334689, -0.74357724], + ], + ] + ) + + self.assertTrue(np.allclose(outputs.last_hidden_state[:2, :3, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_image_classification_head_imagenet_1k(self): + model = FlaxDinov2ForImageClassification.from_pretrained( + "facebook/dinov2-base-imagenet1k-1-layer", from_pt=True + ) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="np") + + # forward pass + outputs = model(**inputs) + logits = outputs.logits + + # verify the logits + expected_shape = (2, 1000) + self.assertEqual(logits.shape, expected_shape) + + expected_slice = np.array([[-2.1776447, 0.36716992, 0.13870952], [-2.1776447, 0.36716992, 0.13870952]]) + + 
self.assertTrue(np.allclose(logits[:2, :3], expected_slice, atol=1e-3)) + + expected_class_idx = 281 + self.assertEqual(logits[0].argmax(-1).item(), expected_class_idx) + self.assertEqual(logits[1].argmax(-1).item(), expected_class_idx) diff --git a/docs/transformers/tests/models/dinov2_with_registers/__init__.py b/docs/transformers/tests/models/dinov2_with_registers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py b/docs/transformers/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py new file mode 100644 index 0000000000000000000000000000000000000000..74ab8d545f05eb8db52d5110b286a805c07b13ec --- /dev/null +++ b/docs/transformers/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py @@ -0,0 +1,369 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Dinov2WithRegisters model.""" + +import unittest + +from transformers import Dinov2WithRegistersConfig +from transformers.testing_utils import ( + require_torch, + require_vision, + slow, + torch_device, +) +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_backbone_common import BackboneTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + Dinov2WithRegistersBackbone, + Dinov2WithRegistersForImageClassification, + Dinov2WithRegistersModel, + ) + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class Dinov2WithRegistersModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + num_register_tokens=2, + mask_ratio=0.5, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + 
self.initializer_range = initializer_range + self.num_register_tokens = num_register_tokens + self.scope = scope + + # in DINOv2 with Registers, the seq length equals the number of patches + 1 + num_register_tokens (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + self.num_register_tokens + self.mask_ratio = mask_ratio + self.num_masks = int(mask_ratio * self.seq_length) + self.mask_length = num_patches + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return Dinov2WithRegistersConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + num_register_tokens=self.num_register_tokens, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = Dinov2WithRegistersModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_backbone(self, config, pixel_values, labels): + model = Dinov2WithRegistersBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify hidden states + self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) + expected_size = self.image_size // config.patch_size + self.parent.assertListEqual( + list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], expected_size, expected_size] + ) + + # verify channels + self.parent.assertEqual(len(model.channels), len(config.out_features)) + + # verify backbone works with out_features=None + config.out_features = None + model = Dinov2WithRegistersBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), 1) + self.parent.assertListEqual( + list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], expected_size, expected_size] + ) + + # verify channels + self.parent.assertEqual(len(model.channels), 1) + + # verify backbone works with apply_layernorm=False and reshape_hidden_states=False + config.apply_layernorm = False + config.reshape_hidden_states = False + + model = Dinov2WithRegistersBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), 1) + self.parent.assertListEqual( + list(result.feature_maps[0].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = Dinov2WithRegistersForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + 
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = Dinov2WithRegistersForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class Dinov2WithRegistersModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as Dinov2WithRegisters does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = ( + ( + Dinov2WithRegistersModel, + Dinov2WithRegistersForImageClassification, + Dinov2WithRegistersBackbone, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "image-feature-extraction": Dinov2WithRegistersModel, + "image-classification": Dinov2WithRegistersForImageClassification, + } + if is_torch_available() + else {} + ) + fx_compatible = False + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = Dinov2WithRegistersModelTester(self) + self.config_tester = ConfigTester( + self, config_class=Dinov2WithRegistersConfig, has_text_modality=False, hidden_size=37 + ) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad and "register_tokens" not in name: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="Dinov2WithRegisters does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + 
config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_backbone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_backbone(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @unittest.skip(reason="Dinov2WithRegisters does not support feedforward chunking yet") + def test_feed_forward_chunking(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/dinov2-with-registers-base" + model = Dinov2WithRegistersModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class Dinov2WithRegistersModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ( + AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base") + if is_vision_available() + else None + ) + + @slow + def test_inference_no_head(self): + model = Dinov2WithRegistersModel.from_pretrained("facebook/dinov2-with-registers-base").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the last hidden states + # in DINOv2 with Registers, the seq length equals the number of patches + 1 + num_register_tokens (we add 1 for the [CLS] token) + num_patches = (image_processor.crop_size["height"] // model.config.patch_size) ** 2 + expected_seq_length = num_patches + 1 + model.config.num_register_tokens + expected_shape = torch.Size((1, expected_seq_length, model.config.hidden_size)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[-0.4636, -1.4582, -0.0274], [-1.4738, -0.8858, 0.3002], [0.0714, -0.2407, -1.5940]], + device=torch_device, + ) + torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + +@require_torch +class Dinov2WithRegistersBackboneTest(unittest.TestCase, BackboneTesterMixin): + all_model_classes = (Dinov2WithRegistersBackbone,) if is_torch_available() else () + config_class = Dinov2WithRegistersConfig + + has_attentions = False + + def setUp(self): + self.model_tester = Dinov2WithRegistersModelTester(self) diff --git a/docs/transformers/tests/models/distilbert/__init__.py b/docs/transformers/tests/models/distilbert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/distilbert/test_modeling_distilbert.py b/docs/transformers/tests/models/distilbert/test_modeling_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..32f4ea349bb09a6bd31248510f8cb219a0b56374 --- /dev/null +++ b/docs/transformers/tests/models/distilbert/test_modeling_distilbert.py @@ -0,0 +1,465 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import tempfile +import unittest + +import pytest + +from transformers import DistilBertConfig, is_torch_available +from transformers.testing_utils import require_flash_attn, require_torch, require_torch_accelerator, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + AutoTokenizer, + DistilBertForMaskedLM, + DistilBertForMultipleChoice, + DistilBertForQuestionAnswering, + DistilBertForSequenceClassification, + DistilBertForTokenClassification, + DistilBertModel, + ) + from transformers.models.distilbert.modeling_distilbert import _create_sinusoidal_embeddings + from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_4 + + +class DistilBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return DistilBertConfig( + vocab_size=self.vocab_size, + dim=self.hidden_size, + n_layers=self.num_hidden_layers, 
+ n_heads=self.num_attention_heads, + hidden_dim=self.intermediate_size, + hidden_act=self.hidden_act, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + def create_and_check_distilbert_model( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DistilBertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_distilbert_for_masked_lm( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DistilBertForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_distilbert_for_question_answering( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DistilBertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_distilbert_for_sequence_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DistilBertForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_distilbert_for_token_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DistilBertForTokenClassification(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_distilbert_for_multiple_choice( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = DistilBertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + 
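A small illustrative sketch of the naming translation performed by get_config() above: the tester's generic attribute names (hidden_size, num_hidden_layers, ...) are passed to DistilBertConfig under its own parameter names (dim, n_layers, n_heads, hidden_dim). It assumes transformers is installed; the values simply mirror the tester defaults.

from transformers import DistilBertConfig

config = DistilBertConfig(dim=32, n_layers=2, n_heads=4, hidden_dim=37)  # tester-sized toy config
assert (config.dim, config.n_layers, config.n_heads, config.hidden_dim) == (32, 2, 4, 37)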
+@require_torch +class DistilBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + DistilBertModel, + DistilBertForMaskedLM, + DistilBertForMultipleChoice, + DistilBertForQuestionAnswering, + DistilBertForSequenceClassification, + DistilBertForTokenClassification, + ) + if is_torch_available() + else None + ) + pipeline_model_mapping = ( + { + "feature-extraction": DistilBertModel, + "fill-mask": DistilBertForMaskedLM, + "question-answering": DistilBertForQuestionAnswering, + "text-classification": DistilBertForSequenceClassification, + "token-classification": DistilBertForTokenClassification, + "zero-shot": DistilBertForSequenceClassification, + } + if is_torch_available() + else {} + ) + fx_compatible = True + test_pruning = True + test_resize_embeddings = True + test_resize_position_embeddings = True + + def setUp(self): + self.model_tester = DistilBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_distilbert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_model(*config_and_inputs) + + def test_distilbert_model_with_sinusoidal_encodings(self): + config = DistilBertConfig(sinusoidal_pos_embds=True) + model = DistilBertModel(config=config) + sinusoidal_pos_embds = torch.empty((config.max_position_embeddings, config.dim), dtype=torch.float32) + _create_sinusoidal_embeddings(config.max_position_embeddings, config.dim, sinusoidal_pos_embds) + self.model_tester.parent.assertTrue( + torch.equal(model.embeddings.position_embeddings.weight, sinusoidal_pos_embds) + ) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_multiple_choice(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "distilbert-base-uncased" + model = DistilBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + @require_torch_accelerator + def test_torchscript_device_change(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + # BertForMultipleChoice behaves incorrectly in JIT environments. 
+ if model_class == DistilBertForMultipleChoice: + self.skipTest(reason="DistilBertForMultipleChoice behaves incorrectly in JIT environments.") + + config.torchscript = True + model = model_class(config=config) + + inputs_dict = self._prepare_for_class(inputs_dict, model_class) + traced_model = torch.jit.trace( + model, (inputs_dict["input_ids"].to("cpu"), inputs_dict["attention_mask"].to("cpu")) + ) + + with tempfile.TemporaryDirectory() as tmp: + torch.jit.save(traced_model, os.path.join(tmp, "traced_model.pt")) + loaded = torch.jit.load(os.path.join(tmp, "traced_model.pt"), map_location=torch_device) + loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device)) + + # Because DistilBertForMultipleChoice requires inputs with different shapes we need to override this test. + @require_flash_attn + @require_torch_accelerator + @pytest.mark.flash_attn_test + @slow + def test_flash_attn_2_inference_equivalence(self): + import torch + + for model_class in self.all_model_classes: + dummy_input = torch.LongTensor( + [ + [1, 2, 3, 4], + [1, 2, 8, 9], + [1, 2, 11, 12], + [1, 2, 13, 14], + ] + ).to(torch_device) + dummy_attention_mask = torch.LongTensor( + [ + [0, 1, 1, 1], + [0, 1, 1, 1], + [0, 1, 1, 1], + [0, 1, 1, 1], + ] + ).to(torch_device) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) + model.to(torch_device) + + logits = model(dummy_input, output_hidden_states=True).hidden_states[-1] + logits_fa = model_fa(dummy_input, output_hidden_states=True).hidden_states[-1] + + torch.testing.assert_close(logits_fa, logits, rtol=4e-2, atol=4e-2) + + output_fa = model_fa(dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True) + logits_fa = output_fa.hidden_states[-1] + + output = model(dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True) + logits = output.hidden_states[-1] + + torch.testing.assert_close(logits_fa[1:], logits[1:], rtol=4e-2, atol=4e-2) + + # Because DistilBertForMultipleChoice requires inputs with different shapes we need to override this test. 
+ @require_flash_attn + @require_torch_accelerator + @pytest.mark.flash_attn_test + @slow + def test_flash_attn_2_inference_equivalence_right_padding(self): + import torch + + for model_class in self.all_model_classes: + dummy_input = torch.LongTensor( + [ + [1, 2, 3, 4], + [1, 2, 8, 9], + [1, 2, 11, 12], + [1, 2, 13, 14], + ] + ).to(torch_device) + dummy_attention_mask = torch.LongTensor( + [ + [0, 1, 1, 1], + [0, 1, 1, 1], + [0, 1, 1, 1], + [0, 1, 1, 1], + ] + ).to(torch_device) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.bfloat16, + ) + model.to(torch_device) + + logits = model(dummy_input, output_hidden_states=True).hidden_states[-1] + logits_fa = model_fa(dummy_input, output_hidden_states=True).hidden_states[-1] + + torch.testing.assert_close(logits_fa, logits, rtol=4e-2, atol=4e-2) + + output_fa = model_fa(dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True) + logits_fa = output_fa.hidden_states[-1] + + output = model(dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True) + logits = output.hidden_states[-1] + + torch.testing.assert_close(logits_fa[:-1], logits[:-1], rtol=4e-2, atol=4e-2) + + +@require_torch +class DistilBertModelIntergrationTest(unittest.TestCase): + @slow + def test_inference_no_head_absolute_embedding(self): + model = DistilBertModel.from_pretrained("distilbert-base-uncased") + input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + with torch.no_grad(): + output = model(input_ids, attention_mask=attention_mask)[0] + expected_shape = torch.Size((1, 11, 768)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[[-0.1639, 0.3299, 0.1648], [-0.1746, 0.3289, 0.1710], [-0.1884, 0.3357, 0.1810]]] + ) + + torch.testing.assert_close(output[:, 1:4, 1:4], expected_slice, rtol=1e-4, atol=1e-4) + + @slow + def test_export(self): + if not is_torch_greater_or_equal_than_2_4: + self.skipTest(reason="This test requires torch >= 2.4 to run.") + + distilbert_model = "distilbert-base-uncased" + device = "cpu" + attn_implementation = "sdpa" + max_length = 64 + + tokenizer = AutoTokenizer.from_pretrained(distilbert_model) + inputs = tokenizer( + f"Paris is the {tokenizer.mask_token} of France.", + return_tensors="pt", + padding="max_length", + max_length=max_length, + ) + + model = DistilBertForMaskedLM.from_pretrained( + distilbert_model, + device_map=device, + attn_implementation=attn_implementation, + ) + + logits = model(**inputs).logits + eager_predicted_mask = tokenizer.decode(logits[0, 4].topk(5).indices) + self.assertEqual( + eager_predicted_mask.split(), + ["capital", "birthplace", "northernmost", "centre", "southernmost"], + ) + + exported_program = torch.export.export( + model, + args=(inputs["input_ids"],), + kwargs={"attention_mask": inputs["attention_mask"]}, + strict=True, + ) + + result = exported_program.module().forward(inputs["input_ids"], inputs["attention_mask"]) + exported_predicted_mask = tokenizer.decode(result.logits[0, 4].topk(5).indices) + self.assertEqual(eager_predicted_mask, exported_predicted_mask) 
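For context on test_distilbert_model_with_sinusoidal_encodings above, a rough NumPy sketch of the standard sinusoidal position table (the usual sin/cos formulation; not guaranteed to be byte-identical to transformers' _create_sinusoidal_embeddings).

import numpy as np

def sinusoidal_table(n_pos: int, dim: int) -> np.ndarray:
    position = np.arange(n_pos, dtype=np.float64)[:, None]
    div = np.power(10000.0, 2 * (np.arange(dim) // 2) / dim)  # one frequency shared per sin/cos pair
    table = position / div
    table[:, 0::2] = np.sin(table[:, 0::2])  # even dimensions -> sine
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd dimensions  -> cosine
    return table

print(sinusoidal_table(512, 768).shape)  # (512, 768): size of DistilBERT-base's position-embedding table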
diff --git a/docs/transformers/tests/models/distilbert/test_modeling_flax_distilbert.py b/docs/transformers/tests/models/distilbert/test_modeling_flax_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..50655771ed14a575afd9388bab48d1440a291c67 --- /dev/null +++ b/docs/transformers/tests/models/distilbert/test_modeling_flax_distilbert.py @@ -0,0 +1,152 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +from transformers import DistilBertConfig, is_flax_available +from transformers.testing_utils import require_flax, slow + +from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask + + +if is_flax_available(): + import jax.numpy as jnp + + from transformers.models.distilbert.modeling_flax_distilbert import ( + FlaxDistilBertForMaskedLM, + FlaxDistilBertForMultipleChoice, + FlaxDistilBertForQuestionAnswering, + FlaxDistilBertForSequenceClassification, + FlaxDistilBertForTokenClassification, + FlaxDistilBertModel, + ) + + +class FlaxDistilBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_attention_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_choices=4, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_choices = num_choices + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = DistilBertConfig( + vocab_size=self.vocab_size, + dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + hidden_dim=self.intermediate_size, + hidden_act=self.hidden_act, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + 
max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + tie_weights_=True, + ) + + return config, input_ids, attention_mask + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_flax +class FlaxDistilBertModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + FlaxDistilBertModel, + FlaxDistilBertForMaskedLM, + FlaxDistilBertForMultipleChoice, + FlaxDistilBertForQuestionAnswering, + FlaxDistilBertForSequenceClassification, + FlaxDistilBertForTokenClassification, + FlaxDistilBertForQuestionAnswering, + ) + if is_flax_available() + else () + ) + + def setUp(self): + self.model_tester = FlaxDistilBertModelTester(self) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("distilbert-base-uncased") + outputs = model(np.ones((1, 1))) + self.assertIsNotNone(outputs) + + +@require_flax +class FlaxDistilBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head_absolute_embedding(self): + model = FlaxDistilBertModel.from_pretrained("distilbert-base-uncased") + input_ids = np.array([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + attention_mask = np.array([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask)[0] + expected_shape = (1, 11, 768) + self.assertEqual(output.shape, expected_shape) + expected_slice = np.array([[[-0.1639, 0.3299, 0.1648], [-0.1746, 0.3289, 0.1710], [-0.1884, 0.3357, 0.1810]]]) + + self.assertTrue(jnp.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) diff --git a/docs/transformers/tests/models/distilbert/test_modeling_tf_distilbert.py b/docs/transformers/tests/models/distilbert/test_modeling_tf_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..674acdad26e88d5f1d65477bb4383db48898a33d --- /dev/null +++ b/docs/transformers/tests/models/distilbert/test_modeling_tf_distilbert.py @@ -0,0 +1,259 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from __future__ import annotations + +import unittest + +from transformers import DistilBertConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers.models.distilbert.modeling_tf_distilbert import ( + TFDistilBertForMaskedLM, + TFDistilBertForMultipleChoice, + TFDistilBertForQuestionAnswering, + TFDistilBertForSequenceClassification, + TFDistilBertForTokenClassification, + TFDistilBertModel, + ) + + +class TFDistilBertModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = False + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 2 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = DistilBertConfig( + vocab_size=self.vocab_size, + dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + hidden_dim=self.intermediate_size, + hidden_act=self.hidden_act, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_distilbert_model( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDistilBertModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask} + + result = model(inputs) + + inputs = [input_ids, input_mask] + + result = model(inputs) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_distilbert_for_masked_lm( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDistilBertForMaskedLM(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_distilbert_for_question_answering( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDistilBertForQuestionAnswering(config=config) + inputs = { + "input_ids": input_ids, + 
"attention_mask": input_mask, + } + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_distilbert_for_sequence_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFDistilBertForSequenceClassification(config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_distilbert_for_multiple_choice( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFDistilBertForMultipleChoice(config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_distilbert_for_token_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFDistilBertForTokenClassification(config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFDistilBertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + TFDistilBertModel, + TFDistilBertForMaskedLM, + TFDistilBertForQuestionAnswering, + TFDistilBertForSequenceClassification, + TFDistilBertForTokenClassification, + TFDistilBertForMultipleChoice, + ) + if is_tf_available() + else None + ) + pipeline_model_mapping = ( + { + "feature-extraction": TFDistilBertModel, + "fill-mask": TFDistilBertForMaskedLM, + "question-answering": TFDistilBertForQuestionAnswering, + "text-classification": TFDistilBertForSequenceClassification, + "token-classification": TFDistilBertForTokenClassification, + "zero-shot": TFDistilBertForSequenceClassification, + } + if is_tf_available() + else {} + ) + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFDistilBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_distilbert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_multiple_choice(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "distilbert/distilbert-base-cased" + model = TFDistilBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFDistilBertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = TFDistilBertModel.from_pretrained("distilbert-base-uncased") + input_ids = tf.constant([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + expected_shape = [1, 6, 768] + self.assertEqual(output.shape, expected_shape) + + expected_slice = tf.constant( + [ + [ + [0.19261885, -0.13732955, 0.4119799], + [0.22150156, -0.07422661, 0.39037204], + [0.22756018, -0.0896414, 0.3701467], + ] + ] + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4) diff --git a/docs/transformers/tests/models/distilbert/test_tokenization_distilbert.py b/docs/transformers/tests/models/distilbert/test_tokenization_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..cf92b48a3d527637f3830bbad8a5400d30db34a0 --- /dev/null +++ b/docs/transformers/tests/models/distilbert/test_tokenization_distilbert.py @@ -0,0 +1,42 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from transformers import DistilBertTokenizer, DistilBertTokenizerFast +from transformers.testing_utils import require_tokenizers, slow + +from ..bert import test_tokenization_bert + + +@require_tokenizers +class DistilBertTokenizationTest(test_tokenization_bert.BertTokenizationTest): + tokenizer_class = DistilBertTokenizer + rust_tokenizer_class = DistilBertTokenizerFast + test_rust_tokenizer = True + from_pretrained_id = "distilbert/distilbert-base-uncased" + + @slow + def test_sequence_builders(self): + tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ + tokenizer.sep_token_id + ] diff --git a/docs/transformers/tests/models/dit/__init__.py b/docs/transformers/tests/models/dit/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/dit/test_modeling_dit.py b/docs/transformers/tests/models/dit/test_modeling_dit.py new file mode 100644 index 0000000000000000000000000000000000000000..1e29cc0fdcca45d975bb5f4ae0660fe3c3549335 --- /dev/null +++ b/docs/transformers/tests/models/dit/test_modeling_dit.py @@ -0,0 +1,60 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from transformers import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + + +if is_torch_available(): + import torch + + from transformers import AutoModelForImageClassification + +if is_vision_available(): + from transformers import AutoImageProcessor + + +@require_torch +@require_vision +class DiTIntegrationTest(unittest.TestCase): + @slow + def test_for_image_classification(self): + image_processor = AutoImageProcessor.from_pretrained("microsoft/dit-base-finetuned-rvlcdip") + model = AutoModelForImageClassification.from_pretrained("microsoft/dit-base-finetuned-rvlcdip") + model.to(torch_device) + + from datasets import load_dataset + + dataset = load_dataset("nielsr/rvlcdip-demo") + + image = dataset["train"][0]["image"].convert("RGB") + + inputs = image_processor(image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + logits = outputs.logits + + expected_shape = torch.Size((1, 16)) + self.assertEqual(logits.shape, expected_shape) + + expected_slice = torch.tensor( + [-0.4158, -0.4092, -0.4347], + device=torch_device, + dtype=torch.float, + ) + torch.testing.assert_close(logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/donut/__init__.py b/docs/transformers/tests/models/donut/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/donut/test_image_processing_donut.py b/docs/transformers/tests/models/donut/test_image_processing_donut.py new file mode 100644 index 0000000000000000000000000000000000000000..29c3bff2a14b56b7186ce8c00d2f125051102f36 --- /dev/null +++ b/docs/transformers/tests/models/donut/test_image_processing_donut.py @@ -0,0 +1,265 @@ +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +import numpy as np + +from transformers.testing_utils import is_flaky, require_torch, require_vision +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import DonutImageProcessor + + if is_torchvision_available(): + from transformers import DonutImageProcessorFast + + +class DonutImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_thumbnail=True, + do_align_axis=False, + do_pad=True, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size if size is not None else {"height": 18, "width": 20} + self.do_thumbnail = do_thumbnail + self.do_align_axis = do_align_axis + self.do_pad = do_pad + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_thumbnail": self.do_thumbnail, + "do_align_long_axis": self.do_align_axis, + "do_pad": self.do_pad, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.size["height"], self.size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class DonutImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = DonutImageProcessor if is_vision_available() else None + fast_image_processing_class = DonutImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = DonutImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_thumbnail")) + self.assertTrue(hasattr(image_processing, "do_align_long_axis")) + self.assertTrue(hasattr(image_processing, "do_pad")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + 
self.assertEqual(image_processor.size, {"height": 18, "width": 20}) + + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + # Previous config had dimensions in (width, height) order + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=(42, 84)) + self.assertEqual(image_processor.size, {"height": 84, "width": 42}) + + def test_image_processor_preprocess_with_kwargs(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + + height = 84 + width = 42 + # Previous config had dimensions in (width, height) order + encoded_images = image_processing(image_inputs[0], size=(width, height), return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.image_processor_tester.num_channels, + height, + width, + ), + ) + + @is_flaky() + def test_call_pil(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.image_processor_tester.num_channels, + self.image_processor_tester.size["height"], + self.image_processor_tester.size["width"], + ), + ) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + self.image_processor_tester.size["height"], + self.image_processor_tester.size["width"], + ), + ) + + @is_flaky() + def test_call_numpy(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.image_processor_tester.num_channels, + self.image_processor_tester.size["height"], + self.image_processor_tester.size["width"], + ), + ) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + self.image_processor_tester.size["height"], + self.image_processor_tester.size["width"], + ), + ) + + @is_flaky() + def test_call_pytorch(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = 
self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + 1, + self.image_processor_tester.num_channels, + self.image_processor_tester.size["height"], + self.image_processor_tester.size["width"], + ), + ) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_images.shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + self.image_processor_tester.size["height"], + self.image_processor_tester.size["width"], + ), + ) + + +@require_torch +@require_vision +class DonutImageProcessingAlignAxisTest(DonutImageProcessingTest): + def setUp(self): + super().setUp() + self.image_processor_tester = DonutImageProcessingTester(self, do_align_axis=True) diff --git a/docs/transformers/tests/models/donut/test_modeling_donut_swin.py b/docs/transformers/tests/models/donut/test_modeling_donut_swin.py new file mode 100644 index 0000000000000000000000000000000000000000..078331389b8c9ffa003308fcf22a8946ad985e83 --- /dev/null +++ b/docs/transformers/tests/models/donut/test_modeling_donut_swin.py @@ -0,0 +1,374 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Donut Swin model.""" + +import collections +import unittest + +from transformers import DonutSwinConfig +from transformers.testing_utils import require_torch, slow, torch_device +from transformers.utils import is_torch_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import DonutSwinForImageClassification, DonutSwinModel + + +class DonutSwinModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=32, + patch_size=2, + num_channels=3, + embed_dim=16, + depths=[1, 2, 1], + num_heads=[2, 2, 4], + window_size=2, + mlp_ratio=2.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + use_absolute_embeddings=False, + patch_norm=True, + initializer_range=0.02, + layer_norm_eps=1e-5, + is_training=True, + scope=None, + use_labels=True, + type_sequence_label_size=10, + encoder_stride=8, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.use_absolute_embeddings = use_absolute_embeddings + self.patch_norm = patch_norm + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.is_training = is_training + self.scope = scope + self.use_labels = use_labels + self.type_sequence_label_size = type_sequence_label_size + self.encoder_stride = encoder_stride + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DonutSwinConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + embed_dim=self.embed_dim, + depths=self.depths, + num_heads=self.num_heads, + window_size=self.window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + drop_path_rate=self.drop_path_rate, + hidden_act=self.hidden_act, + use_absolute_embeddings=self.use_absolute_embeddings, + path_norm=self.patch_norm, + layer_norm_eps=self.layer_norm_eps, + initializer_range=self.initializer_range, + encoder_stride=self.encoder_stride, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DonutSwinModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + expected_seq_len = ((config.image_size // config.patch_size) ** 2) // (4 ** (len(config.depths) - 1)) + expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1)) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, expected_dim)) + + def 
create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = DonutSwinForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = DonutSwinForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DonutSwinModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (DonutSwinModel, DonutSwinForImageClassification) if is_torch_available() else () + pipeline_model_mapping = ( + {"image-feature-extraction": DonutSwinModel, "image-classification": DonutSwinForImageClassification} + if is_torch_available() + else {} + ) + fx_compatible = True + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = DonutSwinModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=DonutSwinConfig, + has_text_modality=False, + embed_dim=37, + common_properties=["image_size", "patch_size", "num_channels"], + ) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @unittest.skip(reason="DonutSwin does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + expected_num_attentions = len(self.model_tester.depths) + self.assertEqual(len(attentions), expected_num_attentions) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + window_size_squared = config.window_size**2 + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = 
model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), expected_num_attentions) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_heads[0], window_size_squared, window_size_squared], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + else: + # also another +1 for reshaped_hidden_states + added_hidden_states = 2 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), expected_num_attentions) + + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_heads[0], window_size_squared, window_size_squared], + ) + + def check_hidden_states_output(self, inputs_dict, config, model_class, image_size): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # DonutSwin has a different seq_length + patch_size = ( + config.patch_size + if isinstance(config.patch_size, collections.abc.Iterable) + else (config.patch_size, config.patch_size) + ) + + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [num_patches, self.model_tester.embed_dim], + ) + + reshaped_hidden_states = outputs.reshaped_hidden_states + self.assertEqual(len(reshaped_hidden_states), expected_num_layers) + + batch_size, num_channels, height, width = reshaped_hidden_states[0].shape + reshaped_hidden_states = ( + reshaped_hidden_states[0].view(batch_size, num_channels, height * width).permute(0, 2, 1) + ) + self.assertListEqual( + list(reshaped_hidden_states.shape[-2:]), + [num_patches, self.model_tester.embed_dim], + ) + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + image_size = ( + self.model_tester.image_size + if isinstance(self.model_tester.image_size, collections.abc.Iterable) + else (self.model_tester.image_size, self.model_tester.image_size) + ) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + def test_hidden_states_output_with_padding(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.patch_size = 3 + + image_size = ( + self.model_tester.image_size + if isinstance(self.model_tester.image_size, collections.abc.Iterable) + else (self.model_tester.image_size, self.model_tester.image_size) + ) + patch_size = 
( + config.patch_size + if isinstance(config.patch_size, collections.abc.Iterable) + else (config.patch_size, config.patch_size) + ) + + padded_height = image_size[0] + patch_size[0] - (image_size[0] % patch_size[0]) + padded_width = image_size[1] + patch_size[1] - (image_size[1] % patch_size[1]) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) + + @slow + def test_model_from_pretrained(self): + model_name = "naver-clova-ix/donut-base" + model = DonutSwinModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if "embeddings" not in name and param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) diff --git a/docs/transformers/tests/models/donut/test_processor_donut.py b/docs/transformers/tests/models/donut/test_processor_donut.py new file mode 100644 index 0000000000000000000000000000000000000000..272f1fd823414885ecf1e3fdfb7a8f63c476a08b --- /dev/null +++ b/docs/transformers/tests/models/donut/test_processor_donut.py @@ -0,0 +1,63 @@ +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import tempfile +import unittest + +from transformers import DonutImageProcessor, DonutProcessor, XLMRobertaTokenizerFast + +from ...test_processing_common import ProcessorTesterMixin + + +class DonutProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "naver-clova-ix/donut-base" + processor_class = DonutProcessor + + @classmethod + def setUpClass(cls): + cls.processor = DonutProcessor.from_pretrained(cls.from_pretrained_id) + cls.tmpdirname = tempfile.mkdtemp() + + image_processor = DonutImageProcessor() + tokenizer = XLMRobertaTokenizerFast.from_pretrained(cls.from_pretrained_id) + + processor = DonutProcessor(image_processor, tokenizer) + + processor.save_pretrained(cls.tmpdirname) + + def test_token2json(self): + expected_json = { + "name": "John Doe", + "age": "99", + "city": "Atlanta", + "state": "GA", + "zip": "30301", + "phone": "123-4567", + "nicknames": [{"nickname": "Johnny"}, {"nickname": "JD"}], + "multiline": "text\nwith\nnewlines", + "empty": "", + } + + sequence = ( + "<s_name>John Doe</s_name><s_age>99</s_age><s_city>Atlanta</s_city>" + "<s_state>GA</s_state><s_zip>30301</s_zip><s_phone>123-4567</s_phone>" + "<s_nicknames><s_nickname>Johnny</s_nickname>" + "<sep/><s_nickname>JD</s_nickname></s_nicknames>" + "<s_multiline>text\nwith\nnewlines</s_multiline>" + "<s_empty></s_empty>" + ) + actual_json = self.processor.token2json(sequence) + + self.assertDictEqual(actual_json, expected_json) diff --git a/docs/transformers/tests/models/dpr/__init__.py b/docs/transformers/tests/models/dpr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/dpr/test_modeling_dpr.py b/docs/transformers/tests/models/dpr/test_modeling_dpr.py new file mode 100644 index 0000000000000000000000000000000000000000..f7c4f6eb183e0bc70a881906a2588d2b57b58a8a --- /dev/null +++ b/docs/transformers/tests/models/dpr/test_modeling_dpr.py @@ -0,0 +1,306 @@ +# Copyright 2020 Huggingface +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +import tempfile +import unittest + +from transformers import DPRConfig, is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import DPRContextEncoder, DPRQuestionEncoder, DPRReader, DPRReaderTokenizer + + +class DPRModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=False, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + projection_dim=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.projection_dim = projection_dim + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return DPRConfig( + projection_dim=self.projection_dim, + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + def create_and_check_context_encoder( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = 
DPRContextEncoder(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size)) + + def create_and_check_question_encoder( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DPRQuestionEncoder(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size)) + + def create_and_check_reader( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DPRReader(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + ) + + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.relevance_logits.shape, (self.batch_size,)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids} + return config, inputs_dict + + +@require_torch +class DPRModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + DPRContextEncoder, + DPRQuestionEncoder, + DPRReader, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = {"feature-extraction": DPRQuestionEncoder} if is_torch_available() else {} + + test_resize_embeddings = False + test_missing_keys = False # why? 
+ test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = DPRModelTester(self) + self.config_tester = ConfigTester(self, config_class=DPRConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_context_encoder_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_context_encoder(*config_and_inputs) + + def test_question_encoder_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_question_encoder(*config_and_inputs) + + def test_reader_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_reader(*config_and_inputs) + + def test_init_changed_config(self): + config = self.model_tester.prepare_config_and_inputs()[0] + + model = DPRQuestionEncoder(config=config) + model.to(torch_device) + model.eval() + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname) + model = DPRQuestionEncoder.from_pretrained(tmp_dirname, projection_dim=512) + + self.assertIsNotNone(model) + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/dpr-ctx_encoder-single-nq-base" + model = DPRContextEncoder.from_pretrained(model_name) + self.assertIsNotNone(model) + + model_name = "facebook/dpr-ctx_encoder-single-nq-base" + model = DPRContextEncoder.from_pretrained(model_name) + self.assertIsNotNone(model) + + model_name = "facebook/dpr-ctx_encoder-single-nq-base" + model = DPRQuestionEncoder.from_pretrained(model_name) + self.assertIsNotNone(model) + + model_name = "facebook/dpr-ctx_encoder-single-nq-base" + model = DPRReader.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class DPRModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head(self): + model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base", return_dict=False) + model.to(torch_device) + + input_ids = torch.tensor( + [[101, 7592, 1010, 2003, 2026, 3899, 10140, 1029, 102]], dtype=torch.long, device=torch_device + ) # [CLS] hello, is my dog cute? [SEP] + output = model(input_ids)[0] # embedding shape = (1, 768) + # compare the actual values for a slice. + expected_slice = torch.tensor( + [ + [ + 0.03236253, + 0.12753335, + 0.16818509, + 0.00279786, + 0.3896933, + 0.24264945, + 0.2178971, + -0.02335227, + -0.08481959, + -0.14324117, + ] + ], + dtype=torch.float, + device=torch_device, + ) + torch.testing.assert_close(output[:, :10], expected_slice, rtol=1e-4, atol=1e-4) + + @slow + def test_reader_inference(self): + tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base") + model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base") + model.to(torch_device) + + encoded_inputs = tokenizer( + questions="What is love ?", + titles="Haddaway", + texts="What Is Love is a song recorded by the artist Haddaway", + padding=True, + return_tensors="pt", + ) + encoded_inputs.to(torch_device) + + outputs = model(**encoded_inputs) + + # compare the actual values for a slice. 
+ expected_start_logits = torch.tensor( + [[-10.3005, -10.7765, -11.4872, -11.6841, -11.9312, -10.3002, -9.8544, -11.7378, -12.0821, -10.2975]], + dtype=torch.float, + device=torch_device, + ) + + expected_end_logits = torch.tensor( + [[-11.0684, -11.7041, -11.5397, -10.3465, -10.8791, -6.8443, -11.9959, -11.0364, -10.0096, -6.8405]], + dtype=torch.float, + device=torch_device, + ) + torch.testing.assert_close(outputs.start_logits[:, :10], expected_start_logits, rtol=1e-4, atol=1e-4) + torch.testing.assert_close(outputs.end_logits[:, :10], expected_end_logits, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/dpr/test_modeling_tf_dpr.py b/docs/transformers/tests/models/dpr/test_modeling_tf_dpr.py new file mode 100644 index 0000000000000000000000000000000000000000..81427f3d945a7091cd6ba52cd51e3bc896ce09c0 --- /dev/null +++ b/docs/transformers/tests/models/dpr/test_modeling_tf_dpr.py @@ -0,0 +1,256 @@ +# Copyright 2020 Huggingface +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +from transformers import is_tf_available +from transformers.testing_utils import require_tf, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import numpy + import tensorflow as tf + + from transformers import ( + BertConfig, + DPRConfig, + TFDPRContextEncoder, + TFDPRQuestionEncoder, + TFDPRReader, + ) + + +class TFDPRModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + projection_dim=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.projection_dim = projection_dim + + def 
prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + # follow test_modeling_tf_ctrl.py + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = BertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + config = DPRConfig(projection_dim=self.projection_dim, **config.to_dict()) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_dpr_context_encoder( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDPRContextEncoder(config=config) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size)) + + def create_and_check_dpr_question_encoder( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDPRQuestionEncoder(config=config) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size)) + + def create_and_check_dpr_reader( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDPRReader(config=config) + result = model(input_ids, attention_mask=input_mask) + + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.relevance_logits.shape, (self.batch_size,)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids} + return config, inputs_dict + + +@require_tf +class TFDPRModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + TFDPRContextEncoder, + TFDPRQuestionEncoder, + TFDPRReader, + ) + if is_tf_available() + else () + ) + pipeline_model_mapping = {"feature-extraction": TFDPRQuestionEncoder} if is_tf_available() else {} + + 
test_resize_embeddings = False + test_missing_keys = False + test_pruning = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFDPRModelTester(self) + self.config_tester = ConfigTester(self, config_class=DPRConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_dpr_context_encoder_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_dpr_context_encoder(*config_and_inputs) + + def test_dpr_question_encoder_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_dpr_question_encoder(*config_and_inputs) + + def test_dpr_reader_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_dpr_reader(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/dpr-ctx_encoder-single-nq-base" + model = TFDPRContextEncoder.from_pretrained(model_name) + self.assertIsNotNone(model) + + model_name = "facebook/dpr-ctx_encoder-single-nq-base" + model = TFDPRContextEncoder.from_pretrained(model_name) + self.assertIsNotNone(model) + + model_name = "facebook/dpr-ctx_encoder-single-nq-base" + model = TFDPRQuestionEncoder.from_pretrained(model_name) + self.assertIsNotNone(model) + + model_name = "facebook/dpr-ctx_encoder-single-nq-base" + model = TFDPRReader.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFDPRModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head(self): + model = TFDPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base") + + input_ids = tf.constant( + [[101, 7592, 1010, 2003, 2026, 3899, 10140, 1029, 102]] + ) # [CLS] hello, is my dog cute? [SEP] + output = model(input_ids)[0] # embedding shape = (1, 768) + # compare the actual values for a slice. + expected_slice = tf.constant( + [ + [ + 0.03236253, + 0.12753335, + 0.16818509, + 0.00279786, + 0.3896933, + 0.24264945, + 0.2178971, + -0.02335227, + -0.08481959, + -0.14324117, + ] + ] + ) + self.assertTrue(numpy.allclose(output[:, :10].numpy(), expected_slice.numpy(), atol=1e-4)) diff --git a/docs/transformers/tests/models/dpr/test_tokenization_dpr.py b/docs/transformers/tests/models/dpr/test_tokenization_dpr.py new file mode 100644 index 0000000000000000000000000000000000000000..ab1401b2951959395d561eccc63df550c3cc845e --- /dev/null +++ b/docs/transformers/tests/models/dpr/test_tokenization_dpr.py @@ -0,0 +1,87 @@ +# Copyright 2020 Huggingface +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from transformers import ( + DPRContextEncoderTokenizer, + DPRContextEncoderTokenizerFast, + DPRQuestionEncoderTokenizer, + DPRQuestionEncoderTokenizerFast, + DPRReaderOutput, + DPRReaderTokenizer, + DPRReaderTokenizerFast, +) +from transformers.testing_utils import require_tokenizers, slow +from transformers.tokenization_utils_base import BatchEncoding + +from ..bert import test_tokenization_bert + + +@require_tokenizers +class DPRContextEncoderTokenizationTest(test_tokenization_bert.BertTokenizationTest): + tokenizer_class = DPRContextEncoderTokenizer + rust_tokenizer_class = DPRContextEncoderTokenizerFast + test_rust_tokenizer = True + from_pretrained_id = "facebook/dpr-ctx_encoder-single-nq-base" + + +@require_tokenizers +class DPRQuestionEncoderTokenizationTest(test_tokenization_bert.BertTokenizationTest): + tokenizer_class = DPRQuestionEncoderTokenizer + rust_tokenizer_class = DPRQuestionEncoderTokenizerFast + test_rust_tokenizer = True + from_pretrained_id = "facebook/dpr-ctx_encoder-single-nq-base" + + +@require_tokenizers +class DPRReaderTokenizationTest(test_tokenization_bert.BertTokenizationTest): + tokenizer_class = DPRReaderTokenizer + rust_tokenizer_class = DPRReaderTokenizerFast + test_rust_tokenizer = True + from_pretrained_id = "facebook/dpr-ctx_encoder-single-nq-base" + + @slow + def test_decode_best_spans(self): + tokenizer = self.tokenizer_class.from_pretrained("google-bert/bert-base-uncased") + + text_1 = tokenizer.encode("question sequence", add_special_tokens=False) + text_2 = tokenizer.encode("title sequence", add_special_tokens=False) + text_3 = tokenizer.encode("text sequence " * 4, add_special_tokens=False) + input_ids = [[101] + text_1 + [102] + text_2 + [102] + text_3] + reader_input = BatchEncoding({"input_ids": input_ids}) + + start_logits = [[0] * len(input_ids[0])] + end_logits = [[0] * len(input_ids[0])] + relevance_logits = [0] + reader_output = DPRReaderOutput(start_logits, end_logits, relevance_logits) + + start_index, end_index = 8, 9 + start_logits[0][start_index] = 10 + end_logits[0][end_index] = 10 + predicted_spans = tokenizer.decode_best_spans(reader_input, reader_output) + self.assertEqual(predicted_spans[0].start_index, start_index) + self.assertEqual(predicted_spans[0].end_index, end_index) + self.assertEqual(predicted_spans[0].doc_id, 0) + + @slow + def test_call(self): + tokenizer = self.tokenizer_class.from_pretrained("google-bert/bert-base-uncased") + + text_1 = tokenizer.encode("question sequence", add_special_tokens=False) + text_2 = tokenizer.encode("title sequence", add_special_tokens=False) + text_3 = tokenizer.encode("text sequence", add_special_tokens=False) + expected_input_ids = [101] + text_1 + [102] + text_2 + [102] + text_3 + encoded_input = tokenizer(questions=["question sequence"], titles=["title sequence"], texts=["text sequence"]) + self.assertIn("input_ids", encoded_input) + self.assertIn("attention_mask", encoded_input) + self.assertListEqual(encoded_input["input_ids"][0], expected_input_ids) diff --git a/docs/transformers/tests/models/dpt/__init__.py b/docs/transformers/tests/models/dpt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/dpt/test_image_processing_dpt.py b/docs/transformers/tests/models/dpt/test_image_processing_dpt.py new file mode 100644 index 0000000000000000000000000000000000000000..8d5e8ea75ef10f8d24797506b9db0bb4b28b87da --- /dev/null +++ 
b/docs/transformers/tests/models/dpt/test_image_processing_dpt.py @@ -0,0 +1,293 @@ +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +import numpy as np +from datasets import load_dataset + +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import DPTImageProcessor + + +class DPTImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + do_reduce_labels=False, + ): + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_reduce_labels = do_reduce_labels + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + "do_reduce_labels": self.do_reduce_labels, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.size["height"], self.size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +# Copied from transformers.tests.models.beit.test_image_processing_beit.prepare_semantic_single_inputs +def prepare_semantic_single_inputs(): + dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True) + + image = Image.open(dataset[0]["file"]) + map = Image.open(dataset[1]["file"]) + + return image, map + + +# Copied from transformers.tests.models.beit.test_image_processing_beit.prepare_semantic_batch_inputs +def prepare_semantic_batch_inputs(): + ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True) + + image1 = Image.open(ds[0]["file"]) + map1 = Image.open(ds[1]["file"]) + image2 = Image.open(ds[2]["file"]) + map2 = Image.open(ds[3]["file"]) + + return [image1, image2], [map1, map2] + + +@require_torch +@require_vision +class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = 
DPTImageProcessor if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = DPTImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "do_pad")) + self.assertTrue(hasattr(image_processing, "size_divisor")) + self.assertTrue(hasattr(image_processing, "do_reduce_labels")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + def test_padding(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + image = np.random.randn(3, 249, 491) + + # test individual method + image = image_processing.pad_image(image, size_divisor=4) + self.assertTrue(image.shape[1] % 4 == 0) + self.assertTrue(image.shape[2] % 4 == 0) + + # test by calling + pixel_values = image_processing.preprocess( + image, do_rescale=False, do_resize=False, do_pad=True, size_divisor=4, return_tensors="pt" + ).pixel_values + self.assertTrue(pixel_values.shape[2] % 4 == 0) + self.assertTrue(pixel_values.shape[3] % 4 == 0) + + def test_keep_aspect_ratio(self): + size = {"height": 512, "width": 512} + image_processor = DPTImageProcessor(size=size, keep_aspect_ratio=True, ensure_multiple_of=32) + + image = np.zeros((489, 640, 3)) + + pixel_values = image_processor(image, return_tensors="pt").pixel_values + + self.assertEqual(list(pixel_values.shape), [1, 3, 512, 672]) + + # Copied from transformers.tests.models.beit.test_image_processing_beit.BeitImageProcessingTest.test_call_segmentation_maps + def test_call_segmentation_maps(self): + # Initialize image_processor + image_processor = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + maps = [] + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + maps.append(torch.zeros(image.shape[-2:]).long()) + + # Test not batched input + encoding = image_processor(image_inputs[0], maps[0], return_tensors="pt") + self.assertEqual( + encoding["pixel_values"].shape, + ( + 1, + self.image_processor_tester.num_channels, + self.image_processor_tester.size["height"], + self.image_processor_tester.size["width"], + ), + ) + self.assertEqual( + encoding["labels"].shape, + ( + 1, + self.image_processor_tester.size["height"], + self.image_processor_tester.size["width"], + ), + ) + self.assertEqual(encoding["labels"].dtype, torch.long) + self.assertTrue(encoding["labels"].min().item() >= 0) + self.assertTrue(encoding["labels"].max().item() <= 255) + + # Test batched + encoding = 
image_processor(image_inputs, maps, return_tensors="pt") + self.assertEqual( + encoding["pixel_values"].shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + self.image_processor_tester.size["height"], + self.image_processor_tester.size["width"], + ), + ) + self.assertEqual( + encoding["labels"].shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.size["height"], + self.image_processor_tester.size["width"], + ), + ) + self.assertEqual(encoding["labels"].dtype, torch.long) + self.assertTrue(encoding["labels"].min().item() >= 0) + self.assertTrue(encoding["labels"].max().item() <= 255) + + # Test not batched input (PIL images) + image, segmentation_map = prepare_semantic_single_inputs() + + encoding = image_processor(image, segmentation_map, return_tensors="pt") + self.assertEqual( + encoding["pixel_values"].shape, + ( + 1, + self.image_processor_tester.num_channels, + self.image_processor_tester.size["height"], + self.image_processor_tester.size["width"], + ), + ) + self.assertEqual( + encoding["labels"].shape, + ( + 1, + self.image_processor_tester.size["height"], + self.image_processor_tester.size["width"], + ), + ) + self.assertEqual(encoding["labels"].dtype, torch.long) + self.assertTrue(encoding["labels"].min().item() >= 0) + self.assertTrue(encoding["labels"].max().item() <= 255) + + # Test batched input (PIL images) + images, segmentation_maps = prepare_semantic_batch_inputs() + + encoding = image_processor(images, segmentation_maps, return_tensors="pt") + self.assertEqual( + encoding["pixel_values"].shape, + ( + 2, + self.image_processor_tester.num_channels, + self.image_processor_tester.size["height"], + self.image_processor_tester.size["width"], + ), + ) + self.assertEqual( + encoding["labels"].shape, + ( + 2, + self.image_processor_tester.size["height"], + self.image_processor_tester.size["width"], + ), + ) + self.assertEqual(encoding["labels"].dtype, torch.long) + self.assertTrue(encoding["labels"].min().item() >= 0) + self.assertTrue(encoding["labels"].max().item() <= 255) + + # Copied from transformers.tests.models.beit.test_image_processing_beit.BeitImageProcessingTest.test_reduce_labels + def test_reduce_labels(self): + # Initialize image_processor + image_processor = self.image_processing_class(**self.image_processor_dict) + + # ADE20k has 150 classes, and the background is included, so labels should be between 0 and 150 + image, map = prepare_semantic_single_inputs() + encoding = image_processor(image, map, return_tensors="pt") + self.assertTrue(encoding["labels"].min().item() >= 0) + self.assertTrue(encoding["labels"].max().item() <= 150) + + image_processor.do_reduce_labels = True + encoding = image_processor(image, map, return_tensors="pt") + self.assertTrue(encoding["labels"].min().item() >= 0) + self.assertTrue(encoding["labels"].max().item() <= 255) diff --git a/docs/transformers/tests/models/dpt/test_modeling_dpt.py b/docs/transformers/tests/models/dpt/test_modeling_dpt.py new file mode 100644 index 0000000000000000000000000000000000000000..248b40121a5fd66a062fac1a612b185a6dc512de --- /dev/null +++ b/docs/transformers/tests/models/dpt/test_modeling_dpt.py @@ -0,0 +1,438 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch DPT model.""" + +import unittest + +from transformers import DPTConfig +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_4 +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import DPTForDepthEstimation, DPTForSemanticSegmentation, DPTModel + from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES + + +if is_vision_available(): + from PIL import Image + + from transformers import DPTImageProcessor + + +class DPTModelTester: + def __init__( + self, + parent, + batch_size=2, + image_size=32, + patch_size=16, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + backbone_out_indices=[0, 1, 2, 3], + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + num_labels=3, + neck_hidden_sizes=[16, 32], + is_hybrid=False, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.backbone_out_indices = backbone_out_indices + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.num_labels = num_labels + self.scope = scope + self.is_hybrid = is_hybrid + self.neck_hidden_sizes = neck_hidden_sizes + # sequence length of DPT = num_patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DPTConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + fusion_hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + backbone_out_indices=self.backbone_out_indices, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + 
attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + is_hybrid=self.is_hybrid, + neck_hidden_sizes=self.neck_hidden_sizes, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DPTModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_depth_estimation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = DPTForDepthEstimation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + + def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = DPTForSemanticSegmentation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_labels, self.image_size, self.image_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as DPT does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (DPTModel, DPTForDepthEstimation, DPTForSemanticSegmentation) if is_torch_available() else () + pipeline_model_mapping = ( + { + "depth-estimation": DPTForDepthEstimation, + "image-feature-extraction": DPTModel, + "image-segmentation": DPTForSemanticSegmentation, + } + if is_torch_available() + else {} + ) + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = DPTModelTester(self) + self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="DPT does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_depth_estimation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + + def test_for_semantic_segmentation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs) + + def test_training(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DPTForDepthEstimation": + continue + + config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values(): + continue + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DPTForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values() or not model_class.supports_gradient_checkpointing: + continue + model = model_class(config) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="Inductor error for dynamic shape") + def test_sdpa_can_compile_dynamic(self): + pass + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + # Skip the check for the backbone + backbone_params = [] + for name, module in model.named_modules(): + if module.__class__.__name__ == "DPTViTHybridEmbeddings": + backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()] + break + + for name, param in model.named_parameters(): + if param.requires_grad: + if name in backbone_params: + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_backbone_selection(self): + def _validate_backbone_init(): + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + if model.__class__.__name__ == "DPTForDepthEstimation": + # Confirm out_indices propagated to backbone + self.assertEqual(len(model.backbone.out_indices), 2) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_pretrained_backbone = True + config.backbone_config = None + config.backbone_kwargs = {"out_indices": [-2, -1]} + # Force load_backbone path + config.is_hybrid = False + + # Load a timm backbone + config.backbone = "resnet18" + config.use_timm_backbone = True + _validate_backbone_init() + + # Load a HF backbone + config.backbone = "facebook/dinov2-small" + config.use_timm_backbone = False + _validate_backbone_init() + + @slow + def test_model_from_pretrained(self): + model_name = "Intel/dpt-large" + model = DPTModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = 
Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +@slow +class DPTModelIntegrationTest(unittest.TestCase): + def test_inference_depth_estimation(self): + image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large") + model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size((1, 384, 384)) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]] + ).to(torch_device) + + torch.testing.assert_close(outputs.predicted_depth[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + def test_inference_semantic_segmentation(self): + image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large-ade") + model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 150, 480, 480)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor( + [[4.0480, 4.2420, 4.4360], [4.3124, 4.5693, 4.8261], [4.5768, 4.8965, 5.2163]] + ).to(torch_device) + + torch.testing.assert_close(outputs.logits[0, 0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + def test_post_processing_semantic_segmentation(self): + image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large-ade") + model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + outputs.logits = outputs.logits.detach().cpu() + + segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(500, 300)]) + expected_shape = torch.Size((500, 300)) + self.assertEqual(segmentation[0].shape, expected_shape) + + segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs) + expected_shape = torch.Size((480, 480)) + self.assertEqual(segmentation[0].shape, expected_shape) + + def test_post_processing_depth_estimation(self): + image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large") + model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + predicted_depth = image_processor.post_process_depth_estimation(outputs=outputs)[0]["predicted_depth"] + expected_shape = torch.Size((384, 384)) + self.assertTrue(predicted_depth.shape == expected_shape) + + predicted_depth_l = image_processor.post_process_depth_estimation(outputs=outputs, target_sizes=[(500, 500)]) + predicted_depth_l = predicted_depth_l[0]["predicted_depth"] + expected_shape = torch.Size((500, 500)) + self.assertTrue(predicted_depth_l.shape == expected_shape) + + output_enlarged = torch.nn.functional.interpolate( + predicted_depth.unsqueeze(0).unsqueeze(1), size=(500, 500), 
mode="bicubic", align_corners=False + ).squeeze() + self.assertTrue(output_enlarged.shape == expected_shape) + torch.testing.assert_close(predicted_depth_l, output_enlarged, atol=1e-3, rtol=1e-3) + + def test_export(self): + for strict in [True, False]: + with self.subTest(strict=strict): + if not is_torch_greater_or_equal_than_2_4: + self.skipTest(reason="This test requires torch >= 2.4 to run.") + model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade").to(torch_device).eval() + image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large-ade") + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + exported_program = torch.export.export( + model, + args=(inputs["pixel_values"],), + strict=strict, + ) + with torch.no_grad(): + eager_outputs = model(**inputs) + exported_outputs = exported_program.module().forward(inputs["pixel_values"]) + self.assertEqual(eager_outputs.logits.shape, exported_outputs.logits.shape) + torch.testing.assert_close(eager_outputs.logits, exported_outputs.logits, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/dpt/test_modeling_dpt_auto_backbone.py b/docs/transformers/tests/models/dpt/test_modeling_dpt_auto_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..91863cc46eadd6d40d57f5e2e17caf7baa9935ba --- /dev/null +++ b/docs/transformers/tests/models/dpt/test_modeling_dpt_auto_backbone.py @@ -0,0 +1,316 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch DPT model.""" + +import unittest + +from transformers import Dinov2Config, DPTConfig +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import DPTForDepthEstimation + from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES + + +if is_vision_available(): + from PIL import Image + + from transformers import DPTImageProcessor + + +class DPTModelTester: + def __init__( + self, + parent, + batch_size=2, + num_channels=3, + image_size=32, + patch_size=16, + use_labels=True, + num_labels=3, + is_training=True, + hidden_size=4, + num_hidden_layers=2, + num_attention_heads=2, + intermediate_size=8, + out_features=["stage1", "stage2"], + apply_layernorm=False, + reshape_hidden_states=False, + neck_hidden_sizes=[2, 2], + fusion_hidden_size=6, + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.patch_size = patch_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.out_features = out_features + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + self.use_labels = use_labels + self.num_labels = num_labels + self.is_training = is_training + self.neck_hidden_sizes = neck_hidden_sizes + self.fusion_hidden_size = fusion_hidden_size + # DPT's sequence length + self.seq_length = (self.image_size // self.patch_size) ** 2 + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DPTConfig( + backbone_config=self.get_backbone_config(), + backbone=None, + neck_hidden_sizes=self.neck_hidden_sizes, + fusion_hidden_size=self.fusion_hidden_size, + ) + + def get_backbone_config(self): + return Dinov2Config( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + is_training=self.is_training, + out_features=self.out_features, + reshape_hidden_states=self.reshape_hidden_states, + ) + + def create_and_check_for_depth_estimation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = DPTForDepthEstimation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DPTModelTest(ModelTesterMixin, 
PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as DPT does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (DPTForDepthEstimation,) if is_torch_available() else () + pipeline_model_mapping = {"depth-estimation": DPTForDepthEstimation} if is_torch_available() else {} + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = DPTModelTester(self) + self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="DPT with AutoBackbone does not have a base model and hence no input_embeddings") + def test_inputs_embeds(self): + pass + + def test_for_depth_estimation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + + def test_training(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DPTForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values(): + continue + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DPTForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values() or not model_class.supports_gradient_checkpointing: + continue + model = model_class(config) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + # Skip the check for the backbone + backbone_params = [] + for name, module in model.named_modules(): + if module.__class__.__name__ == "DPTViTHybridEmbeddings": + backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()] + break + + for name, param in model.named_parameters(): + if param.requires_grad: + if name in backbone_params: + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + @unittest.skip(reason="DPT with AutoBackbone does not have a base model and hence no input_embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, 
check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "Intel/dpt-large" + model = DPTForDepthEstimation.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +@slow +class DPTModelIntegrationTest(unittest.TestCase): + def test_inference_depth_estimation_dinov2(self): + image_processor = DPTImageProcessor.from_pretrained("facebook/dpt-dinov2-small-kitti") + model = DPTForDepthEstimation.from_pretrained("facebook/dpt-dinov2-small-kitti").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size((1, 576, 736)) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[6.0336, 7.1502, 7.4130], [6.8977, 7.2383, 7.2268], [7.9180, 8.0525, 8.0134]] + ).to(torch_device) + + torch.testing.assert_close(outputs.predicted_depth[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + def test_inference_depth_estimation_beit(self): + image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-beit-base-384") + model = DPTForDepthEstimation.from_pretrained("Intel/dpt-beit-base-384").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size((1, 384, 384)) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[2669.7061, 2663.7144, 2674.9399], [2633.9326, 2650.9092, 2665.4270], [2621.8271, 2632.0129, 2637.2290]] + ).to(torch_device) + + torch.testing.assert_close(outputs.predicted_depth[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + def test_inference_depth_estimation_swinv2(self): + image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-swinv2-tiny-256") + model = DPTForDepthEstimation.from_pretrained("Intel/dpt-swinv2-tiny-256").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size((1, 256, 256)) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[1032.7719, 1025.1886, 1030.2661], [1023.7619, 1021.0075, 1024.9121], [1022.5667, 1018.8522, 1021.4145]] + ).to(torch_device) + + torch.testing.assert_close(outputs.predicted_depth[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/dpt/test_modeling_dpt_hybrid.py b/docs/transformers/tests/models/dpt/test_modeling_dpt_hybrid.py new file mode 100644 index 0000000000000000000000000000000000000000..fbdd88278eaa5bd9842f01afbcbaae4b0ab92c0c --- /dev/null +++ b/docs/transformers/tests/models/dpt/test_modeling_dpt_hybrid.py @@ -0,0 +1,338 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch DPT model.""" + +import unittest + +from transformers import DPTConfig +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import DPTForDepthEstimation, DPTForSemanticSegmentation, DPTModel + from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES + + +if is_vision_available(): + from PIL import Image + + from transformers import DPTImageProcessor + + +class DPTModelTester: + def __init__( + self, + parent, + batch_size=2, + image_size=32, + patch_size=16, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=4, + backbone_out_indices=[0, 1, 2, 3], + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + num_labels=3, + backbone_featmap_shape=[1, 32, 24, 24], + neck_hidden_sizes=[16, 16, 32, 32], + is_hybrid=True, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.backbone_out_indices = backbone_out_indices + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.num_labels = num_labels + self.backbone_featmap_shape = backbone_featmap_shape + self.scope = scope + self.is_hybrid = is_hybrid + self.neck_hidden_sizes = neck_hidden_sizes + # sequence length of DPT = num_patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + backbone_config = { + "global_padding": "same", + "layer_type": "bottleneck", + "depths": [3, 4, 9], + "out_features": ["stage1", "stage2", "stage3"], + "embedding_dynamic_padding": True, + "hidden_sizes": [16, 16, 32, 32], + "num_groups": 2, + } + + return DPTConfig( + image_size=self.image_size, 
+ patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + fusion_hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + backbone_out_indices=self.backbone_out_indices, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + is_hybrid=self.is_hybrid, + backbone_config=backbone_config, + backbone=None, + backbone_featmap_shape=self.backbone_featmap_shape, + neck_hidden_sizes=self.neck_hidden_sizes, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DPTModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_depth_estimation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = DPTForDepthEstimation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + + def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = DPTForSemanticSegmentation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_labels, self.image_size, self.image_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DPTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as DPT does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = (DPTModel, DPTForDepthEstimation, DPTForSemanticSegmentation) if is_torch_available() else () + pipeline_model_mapping = ( + { + "depth-estimation": DPTForDepthEstimation, + "feature-extraction": DPTModel, + "image-segmentation": DPTForSemanticSegmentation, + } + if is_torch_available() + else {} + ) + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = DPTModelTester(self) + self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="DPT does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_depth_estimation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + + def test_for_semantic_segmentation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs) + + def test_training(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DPTForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values(): + continue + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DPTForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values() or not model_class.supports_gradient_checkpointing: + continue + model = model_class(config) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in 
self.all_model_classes: + model = model_class(config=configs_no_init) + # Skip the check for the backbone + backbone_params = [] + for name, module in model.named_modules(): + if module.__class__.__name__ == "DPTViTHybridEmbeddings": + backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()] + break + + for name, param in model.named_parameters(): + if param.requires_grad: + if name in backbone_params: + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + @slow + def test_model_from_pretrained(self): + model_name = "Intel/dpt-hybrid-midas" + model = DPTModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_raise_readout_type(self): + # We do this test only for DPTForDepthEstimation since it is the only model that uses readout_type + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + config.readout_type = "add" + with self.assertRaises(ValueError): + _ = DPTForDepthEstimation(config) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +@slow +class DPTModelIntegrationTest(unittest.TestCase): + def test_inference_depth_estimation(self): + image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas") + model = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size((1, 384, 384)) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[[5.6437, 5.6146, 5.6511], [5.4371, 5.5649, 5.5958], [5.5215, 5.5184, 5.5293]]] + ).to(torch_device) + + torch.testing.assert_close(outputs.predicted_depth[:3, :3, :3] / 100, expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/efficientnet/__init__.py b/docs/transformers/tests/models/efficientnet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/efficientnet/test_image_processing_efficientnet.py b/docs/transformers/tests/models/efficientnet/test_image_processing_efficientnet.py new file mode 100644 index 0000000000000000000000000000000000000000..cb8fc8d92209c5246b93e3e6c941c5d4ab76d118 --- /dev/null +++ b/docs/transformers/tests/models/efficientnet/test_image_processing_efficientnet.py @@ -0,0 +1,190 @@ +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
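# Editor's note (illustrative sketch, not part of the patch): the DPT integration test above
# drives DPTForDepthEstimation directly with an image processor; the same "Intel/dpt-hybrid-midas"
# checkpoint can also be exercised through the high-level pipeline API. The fixture image path is
# reused from the tests and assumes the sketch is run from the repository root.
from PIL import Image
from transformers import pipeline

depth_estimator = pipeline("depth-estimation", model="Intel/dpt-hybrid-midas")
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
result = depth_estimator(image)
# result["predicted_depth"] is a torch.Tensor, result["depth"] is a PIL.Image depth map
print(result["predicted_depth"].shape)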
+ + +import unittest + +import numpy as np + +from transformers.image_utils import PILImageResampling +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import ( + is_torch_available, + is_torchvision_available, + is_vision_available, +) + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from transformers import EfficientNetImageProcessor + + if is_torchvision_available(): + from transformers import EfficientNetImageProcessorFast + + +class EfficientNetImageProcessorTester: + def __init__( + self, + parent, + batch_size=13, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + do_rescale=True, + rescale_offset=True, + rescale_factor=1 / 127.5, + resample=PILImageResampling.BILINEAR, # NEAREST is too different between PIL and torchvision + ): + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.resample = resample + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + "resample": self.resample, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.size["height"], self.size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class EfficientNetImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = EfficientNetImageProcessor if is_vision_available() else None + fast_image_processing_class = EfficientNetImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = EfficientNetImageProcessorTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42) 
+ self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + def test_rescale(self): + # EfficientNet optionally rescales between -1 and 1 instead of the usual 0 and 1 + image = np.arange(0, 256, 1, dtype=np.uint8).reshape(1, 8, 32) + + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class(**self.image_processor_dict) + if image_processing_class == EfficientNetImageProcessorFast: + image = torch.from_numpy(image) + + # Scale between [-1, 1] with rescale_factor 1/127.5 and rescale_offset=True + rescaled_image = image_processor.rescale(image, scale=1 / 127.5, offset=True) + expected_image = (image * (1 / 127.5)) - 1 + self.assertTrue(torch.allclose(rescaled_image, expected_image)) + + # Scale between [0, 1] with rescale_factor 1/255 and rescale_offset=True + rescaled_image = image_processor.rescale(image, scale=1 / 255, offset=False) + expected_image = image / 255.0 + self.assertTrue(torch.allclose(rescaled_image, expected_image)) + + else: + rescaled_image = image_processor.rescale(image, scale=1 / 127.5, dtype=np.float64) + expected_image = (image * (1 / 127.5)).astype(np.float64) - 1 + self.assertTrue(np.allclose(rescaled_image, expected_image)) + + rescaled_image = image_processor.rescale(image, scale=1 / 255, offset=False, dtype=np.float64) + expected_image = (image / 255.0).astype(np.float64) + self.assertTrue(np.allclose(rescaled_image, expected_image)) + + @require_vision + @require_torch + def test_rescale_normalize(self): + if self.image_processing_class is None or self.fast_image_processing_class is None: + self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined") + + image = torch.arange(0, 256, 1, dtype=torch.uint8).reshape(1, 8, 32).repeat(3, 1, 1) + image_mean_0 = (0.0, 0.0, 0.0) + image_std_0 = (1.0, 1.0, 1.0) + image_mean_1 = (0.5, 0.5, 0.5) + image_std_1 = (0.5, 0.5, 0.5) + + image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict) + + # Rescale between [-1, 1] with rescale_factor=1/127.5 and rescale_offset=True. Then normalize + rescaled_normalized = image_processor_fast.rescale_and_normalize( + image, True, 1 / 127.5, True, image_mean_0, image_std_0, True + ) + expected_image = (image * (1 / 127.5)) - 1 + expected_image = (expected_image - torch.tensor(image_mean_0).view(3, 1, 1)) / torch.tensor(image_std_0).view( + 3, 1, 1 + ) + self.assertTrue(torch.allclose(rescaled_normalized, expected_image, rtol=1e-3)) + + # Rescale between [0, 1] with rescale_factor=1/255 and rescale_offset=False. Then normalize + rescaled_normalized = image_processor_fast.rescale_and_normalize( + image, True, 1 / 255, True, image_mean_1, image_std_1, False + ) + expected_image = image * (1 / 255.0) + expected_image = (expected_image - torch.tensor(image_mean_1).view(3, 1, 1)) / torch.tensor(image_std_1).view( + 3, 1, 1 + ) + self.assertTrue(torch.allclose(rescaled_normalized, expected_image, rtol=1e-3)) diff --git a/docs/transformers/tests/models/efficientnet/test_modeling_efficientnet.py b/docs/transformers/tests/models/efficientnet/test_modeling_efficientnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f0706ad1536a6ce1fd163849bddf9731b9f0a9b4 --- /dev/null +++ b/docs/transformers/tests/models/efficientnet/test_modeling_efficientnet.py @@ -0,0 +1,262 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch EfficientNet model.""" + +import unittest + +from transformers import EfficientNetConfig +from transformers.testing_utils import is_pipeline_test, require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import EfficientNetForImageClassification, EfficientNetModel + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class EfficientNetModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=32, + num_channels=3, + kernel_sizes=[3, 3, 5], + in_channels=[32, 16, 24], + out_channels=[16, 24, 20], + strides=[1, 1, 2], + num_block_repeats=[1, 1, 2], + expand_ratios=[1, 6, 6], + is_training=True, + use_labels=True, + intermediate_size=37, + hidden_act="gelu", + num_labels=10, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.kernel_sizes = kernel_sizes + self.in_channels = in_channels + self.out_channels = out_channels + self.strides = strides + self.num_block_repeats = num_block_repeats + self.expand_ratios = expand_ratios + self.is_training = is_training + self.hidden_act = hidden_act + self.num_labels = num_labels + self.use_labels = use_labels + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.num_labels) + + config = self.get_config() + return config, pixel_values, labels + + def get_config(self): + return EfficientNetConfig( + image_size=self.image_size, + num_channels=self.num_channels, + kernel_sizes=self.kernel_sizes, + in_channels=self.in_channels, + out_channels=self.out_channels, + strides=self.strides, + num_block_repeats=self.num_block_repeats, + expand_ratios=self.expand_ratios, + hidden_act=self.hidden_act, + num_labels=self.num_labels, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = EfficientNetModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + # expected last hidden states: B, C, H // 4, W // 4 + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, config.hidden_dim, self.image_size // 4, self.image_size // 4), + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + model = EfficientNetForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def 
prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class EfficientNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as EfficientNet does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (EfficientNetModel, EfficientNetForImageClassification) if is_torch_available() else () + pipeline_model_mapping = ( + {"image-feature-extraction": EfficientNetModel, "image-classification": EfficientNetForImageClassification} + if is_torch_available() + else {} + ) + + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = EfficientNetModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=EfficientNetConfig, + has_text_modality=False, + hidden_size=37, + common_properties=["num_channels", "image_size", "hidden_dim"], + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="EfficientNet does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="EfficientNet does not support input and output embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="EfficientNet does not use feedforward chunking") + def test_feed_forward_chunking(self): + pass + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + num_blocks = sum(config.num_block_repeats) * 4 + self.assertEqual(len(hidden_states), num_blocks) + + # EfficientNet's feature maps are of shape (batch_size, num_channels, height, width) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.image_size // 2, self.model_tester.image_size // 2], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "google/efficientnet-b7" + model = EfficientNetModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @is_pipeline_test + @require_vision + @slow + def test_pipeline_image_feature_extraction(self): + super().test_pipeline_image_feature_extraction() + + @is_pipeline_test + @require_vision + @slow + def 
test_pipeline_image_feature_extraction_fp16(self): + super().test_pipeline_image_feature_extraction_fp16() + + @is_pipeline_test + @require_vision + @slow + def test_pipeline_image_classification(self): + super().test_pipeline_image_classification() + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class EfficientNetModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return AutoImageProcessor.from_pretrained("google/efficientnet-b7") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = EfficientNetForImageClassification.from_pretrained("google/efficientnet-b7").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([-0.2962, 0.4487, 0.4499]).to(torch_device) + torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/electra/__init__.py b/docs/transformers/tests/models/electra/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/electra/test_modeling_electra.py b/docs/transformers/tests/models/electra/test_modeling_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..7d451ff6378acf8b1274bfe29023ab37a29bd654 --- /dev/null +++ b/docs/transformers/tests/models/electra/test_modeling_electra.py @@ -0,0 +1,489 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
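# Editor's note (illustrative sketch, not part of the patch): the EfficientNet integration test
# above only asserts on raw logits; a typical end-to-end classification call with the same
# "google/efficientnet-b7" checkpoint looks like the sketch below. The fixture image path is an
# assumption reused from the tests and presumes the repository root as working directory.
import torch
from PIL import Image
from transformers import AutoImageProcessor, EfficientNetForImageClassification

processor = AutoImageProcessor.from_pretrained("google/efficientnet-b7")
model = EfficientNetForImageClassification.from_pretrained("google/efficientnet-b7")

image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # shape: (1, 1000), as checked in the test above
predicted_label = model.config.id2label[logits.argmax(-1).item()]
print(predicted_label)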
+ + +import unittest + +from transformers import ElectraConfig, is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + ElectraForCausalLM, + ElectraForMaskedLM, + ElectraForMultipleChoice, + ElectraForPreTraining, + ElectraForQuestionAnswering, + ElectraForSequenceClassification, + ElectraForTokenClassification, + ElectraModel, + ) + + +class ElectraModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1) + + config = self.get_config() + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ) + + def get_config(self): + return ElectraConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + 
max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + _, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_electra_model( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = ElectraModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_electra_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = ElectraModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_electra_for_masked_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = ElectraForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_electra_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = ElectraForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_electra_for_token_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_labels = self.num_labels + model = ElectraForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, 
token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_electra_for_pretraining( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_labels = self.num_labels + model = ElectraForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_electra_for_sequence_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_labels = self.num_labels + model = ElectraForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_electra_for_question_answering( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = ElectraForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_electra_for_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_choices = self.num_choices + model = ElectraForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class ElectraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + ElectraModel, + ElectraForPreTraining, + ElectraForMaskedLM, + ElectraForCausalLM, + ElectraForMultipleChoice, + ElectraForTokenClassification, + ElectraForSequenceClassification, + ElectraForQuestionAnswering, + ) + if is_torch_available() + else () + ) + # Doesn't run generation tests. 
There are interface mismatches when using `generate` -- TODO @gante + all_generative_model_classes = () + pipeline_model_mapping = ( + { + "feature-extraction": ElectraModel, + "fill-mask": ElectraForMaskedLM, + "question-answering": ElectraForQuestionAnswering, + "text-classification": ElectraForSequenceClassification, + "text-generation": ElectraForCausalLM, + "token-classification": ElectraForTokenClassification, + "zero-shot": ElectraForSequenceClassification, + } + if is_torch_available() + else {} + ) + fx_compatible = True + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = ElectraModelTester(self) + self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_electra_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_model(*config_and_inputs) + + def test_electra_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_electra_model_as_decoder(*config_and_inputs) + + def test_electra_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_electra_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_token_classification(*config_and_inputs) + + def test_for_pre_training(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_pretraining(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_sequence_classification(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_question_answering(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_multiple_choice(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "google/electra-small-generator" + model = ElectraModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_electra_for_causal_lm(*config_and_inputs) + + +@require_torch +class ElectraModelIntegrationTest(unittest.TestCase): + @slow + def 
test_inference_no_head_absolute_embedding(self): + model = ElectraModel.from_pretrained("google/electra-small-discriminator") + input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask)[0] + expected_shape = torch.Size((1, 11, 256)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[[0.4471, 0.6821, -0.3265], [0.4627, 0.5255, -0.3668], [0.4532, 0.3313, -0.4344]]] + ) + + torch.testing.assert_close(output[:, 1:4, 1:4], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/electra/test_modeling_flax_electra.py b/docs/transformers/tests/models/electra/test_modeling_flax_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..698a492fc3c7972f7fdbdeca6ba53537e3819d0d --- /dev/null +++ b/docs/transformers/tests/models/electra/test_modeling_flax_electra.py @@ -0,0 +1,136 @@ +import unittest + +import numpy as np + +from transformers import ElectraConfig, is_flax_available +from transformers.testing_utils import require_flax, slow + +from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask + + +if is_flax_available(): + from transformers.models.electra.modeling_flax_electra import ( + FlaxElectraForCausalLM, + FlaxElectraForMaskedLM, + FlaxElectraForMultipleChoice, + FlaxElectraForPreTraining, + FlaxElectraForQuestionAnswering, + FlaxElectraForSequenceClassification, + FlaxElectraForTokenClassification, + FlaxElectraModel, + ) + + +class FlaxElectraModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_attention_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + embedding_size=24, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_choices=4, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_attention_mask = use_attention_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.embedding_size = embedding_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_choices = num_choices + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + attention_mask = None + if self.use_attention_mask: + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + config = ElectraConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + embedding_size=self.embedding_size, + 
num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, attention_mask + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, token_type_ids, attention_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_flax +class FlaxElectraModelTest(FlaxModelTesterMixin, unittest.TestCase): + test_head_masking = True + + all_model_classes = ( + ( + FlaxElectraModel, + FlaxElectraForCausalLM, + FlaxElectraForMaskedLM, + FlaxElectraForPreTraining, + FlaxElectraForTokenClassification, + FlaxElectraForQuestionAnswering, + FlaxElectraForMultipleChoice, + FlaxElectraForSequenceClassification, + ) + if is_flax_available() + else () + ) + + def setUp(self): + self.model_tester = FlaxElectraModelTester(self) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + if model_class_name == FlaxElectraForMaskedLM: + model = model_class_name.from_pretrained("google/electra-small-generator") + else: + model = model_class_name.from_pretrained("google/electra-small-discriminator") + outputs = model(np.ones((1, 1))) + self.assertIsNotNone(outputs) diff --git a/docs/transformers/tests/models/electra/test_modeling_tf_electra.py b/docs/transformers/tests/models/electra/test_modeling_tf_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..2b7e8d1a18f3f8dc5a2d035134eecb8e619dd9e6 --- /dev/null +++ b/docs/transformers/tests/models/electra/test_modeling_tf_electra.py @@ -0,0 +1,615 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
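# Editor's note (illustrative sketch, not part of the patch): the *ForPreTraining tests above only
# check that the discriminator logits have shape (batch_size, seq_length). Conceptually, the ELECTRA
# discriminator scores each token as original vs. replaced; a minimal sketch with the
# "google/electra-small-discriminator" checkpoint used elsewhere in these tests (the example
# sentence and the deliberately swapped word are assumptions for illustration):
import torch
from transformers import ElectraForPreTraining, ElectraTokenizerFast

tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-discriminator")
discriminator = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator")

# "jumps" has been replaced by an implausible word; the discriminator should flag that position.
inputs = tokenizer("the quick brown fox ate over the lazy dog", return_tensors="pt")
with torch.no_grad():
    logits = discriminator(**inputs).logits  # shape: (batch_size, seq_length)
predictions = torch.round((torch.sign(logits) + 1) / 2)  # 1.0 marks tokens predicted as replaced
print(list(zip(tokenizer.convert_ids_to_tokens(inputs.input_ids[0]), predictions[0].tolist())))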
+ + +from __future__ import annotations + +import unittest + +from transformers import ElectraConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers.models.electra.modeling_tf_electra import ( + TFElectraForMaskedLM, + TFElectraForMultipleChoice, + TFElectraForPreTraining, + TFElectraForQuestionAnswering, + TFElectraForSequenceClassification, + TFElectraForTokenClassification, + TFElectraModel, + ) + + +class TFElectraModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 2 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + self.embedding_size = 128 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = ElectraConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFElectraModel(config=config) + inputs = 
{"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_causal_lm_base_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.is_decoder = True + + model = TFElectraModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + + model = TFElectraModel(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + "encoder_hidden_states": encoder_hidden_states, + "encoder_attention_mask": encoder_attention_mask, + } + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states) + + # Also check the case where encoder outputs are not passed + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_causal_lm_base_model_past( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + config.is_decoder = True + + model = TFElectraModel(config=config) + + # first forward pass + outputs = model(input_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids) + outputs_no_past = model(input_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + past_key_values = outputs.past_key_values + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # append to next input_ids and attn_mask + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + + output_from_no_past = model(next_input_ids, output_hidden_states=True).hidden_states[0] + output_from_past = model( + next_tokens, past_key_values=past_key_values, output_hidden_states=True + ).hidden_states[0] + + # select random slice + random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1])) + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] + output_from_past_slice = output_from_past[:, 0, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6) + + def create_and_check_causal_lm_base_model_past_with_attn_mask( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + config.is_decoder = True + + model = TFElectraModel(config=config) + + # create attention mask + half_seq_length = self.seq_length // 2 + 
attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32) + attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32) + attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1) + + # first forward pass + outputs = model(input_ids, attention_mask=attn_mask, use_cache=True) + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + past_key_values = outputs.past_key_values + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size) + vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change) + condition = tf.transpose( + tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size)) + ) + input_ids = tf.where(condition, random_other_next_tokens, input_ids) + + # append to next input_ids and + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + attn_mask = tf.concat( + [attn_mask, tf.ones((attn_mask.shape[0], 1), dtype=tf.int32)], + axis=1, + ) + + output_from_no_past = model( + next_input_ids, + attention_mask=attn_mask, + output_hidden_states=True, + ).hidden_states[0] + output_from_past = model( + next_tokens, past_key_values=past_key_values, attention_mask=attn_mask, output_hidden_states=True + ).hidden_states[0] + + # select random slice + random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1])) + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] + output_from_past_slice = output_from_past[:, 0, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6) + + def create_and_check_causal_lm_base_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + config.is_decoder = True + + model = TFElectraModel(config=config) + + input_ids = input_ids[:1, :] + input_mask = input_mask[:1, :] + self.batch_size = 1 + + # first forward pass + outputs = model(input_ids, attention_mask=input_mask, use_cache=True) + past_key_values = outputs.past_key_values + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + output_hidden_states=True, + ).hidden_states[0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + ).hidden_states[0] + + self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1]) + + # select random slice + random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1])) + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] + output_from_past_slice = output_from_past[:, :, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3) + + def create_and_check_decoder_model_past_large_inputs( + self, + 
config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + + model = TFElectraModel(config=config) + + input_ids = input_ids[:1, :] + input_mask = input_mask[:1, :] + encoder_hidden_states = encoder_hidden_states[:1, :, :] + encoder_attention_mask = encoder_attention_mask[:1, :] + self.batch_size = 1 + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + ).hidden_states[0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + ).hidden_states[0] + + self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1]) + + # select random slice + random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1])) + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] + output_from_past_slice = output_from_past[:, :, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFElectraForMaskedLM(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFElectraForPreTraining(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFElectraForSequenceClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TFElectraForMultipleChoice(config=config) + 
multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFElectraForQuestionAnswering(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFElectraForTokenClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFElectraModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + TFElectraModel, + TFElectraForMaskedLM, + TFElectraForPreTraining, + TFElectraForTokenClassification, + TFElectraForMultipleChoice, + TFElectraForSequenceClassification, + TFElectraForQuestionAnswering, + ) + if is_tf_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": TFElectraModel, + "fill-mask": TFElectraForMaskedLM, + "question-answering": TFElectraForQuestionAnswering, + "text-classification": TFElectraForSequenceClassification, + "token-classification": TFElectraForTokenClassification, + "zero-shot": TFElectraForSequenceClassification, + } + if is_tf_available() + else {} + ) + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFElectraModelTester(self) + self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + """Test the base model""" + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_causal_lm_base_model(self): + """Test the base model of the causal LM model + + is_deocder=True, no cross_attention, no encoder outputs + """ + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_causal_lm_base_model(*config_and_inputs) + + def test_model_as_decoder(self): + """Test the base model as a decoder (of an encoder-decoder architecture) + + 
is_deocder=True + cross_attention + pass encoder outputs + """ + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_causal_lm_base_model_past(self): + """Test causal LM base model with `past_key_values`""" + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_causal_lm_base_model_past(*config_and_inputs) + + def test_causal_lm_base_model_past_with_attn_mask(self): + """Test the causal LM base model with `past_key_values` and `attention_mask`""" + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_causal_lm_base_model_past_with_attn_mask(*config_and_inputs) + + def test_causal_lm_base_model_past_with_large_inputs(self): + """Test the causal LM base model with `past_key_values` and a longer decoder sequence length""" + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_causal_lm_base_model_past_large_inputs(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + """Similar to `test_causal_lm_base_model_past_with_large_inputs` but with cross-attention""" + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_pretraining(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + # model_name = 'google/electra-small-generator' + for model_name in ["google/electra-small-discriminator"]: + model = TFElectraModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +class TFElectraModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = TFElectraForPreTraining.from_pretrained("lysandre/tiny-electra-random") + input_ids = tf.constant([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + expected_shape = [1, 6] + self.assertEqual(output.shape, expected_shape) + + print(output[:, :3]) + + expected_slice = tf.constant([[-0.24651965, 0.8835437, 1.823782]]) + tf.debugging.assert_near(output[:, :3], expected_slice, atol=1e-4) diff --git a/docs/transformers/tests/models/electra/test_tokenization_electra.py b/docs/transformers/tests/models/electra/test_tokenization_electra.py new file mode 100644 index 
0000000000000000000000000000000000000000..4c736e167082cb90c16cb37e684b5fbce065cc03 --- /dev/null +++ b/docs/transformers/tests/models/electra/test_tokenization_electra.py @@ -0,0 +1,336 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +from transformers import ElectraTokenizerFast +from transformers.models.electra.tokenization_electra import ( + VOCAB_FILES_NAMES, + BasicTokenizer, + ElectraTokenizer, + WordpieceTokenizer, + _is_control, + _is_punctuation, + _is_whitespace, +) +from transformers.testing_utils import require_tokenizers, slow + +from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english + + +@require_tokenizers +class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/electra-small-generator" + tokenizer_class = ElectraTokenizer + rust_tokenizer_class = ElectraTokenizerFast + test_rust_tokenizer = True + space_between_special_tokens = True + from_pretrained_filter = filter_non_english + + @classmethod + def setUpClass(cls): + super().setUpClass() + + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00e9d,running" + output_text = "unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize("UNwant\u00e9d,running") + self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11]) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "UNwant\u00e9d,running" + + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + # With lower casing + tokenizer = self.get_tokenizer(do_lower_case=True) + rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True) + + sequence = "UNwant\u00e9d,running" + + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, 
rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + def test_chinese(self): + tokenizer = BasicTokenizer() + + self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"]) + + def test_basic_tokenizer_lower(self): + tokenizer = BasicTokenizer(do_lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) + + def test_basic_tokenizer_lower_strip_accents_false(self): + tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"]) + + def test_basic_tokenizer_lower_strip_accents_true(self): + tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) + + def test_basic_tokenizer_lower_strip_accents_default(self): + tokenizer = BasicTokenizer(do_lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) + + def test_basic_tokenizer_no_lower(self): + tokenizer = BasicTokenizer(do_lower_case=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_no_lower_strip_accents_false(self): + tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HäLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_no_lower_strip_accents_true(self): + tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HaLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_respects_never_split_tokens(self): + tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"]) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? 
[UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"] + ) + + def test_wordpiece_tokenizer(self): + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] + + vocab = {} + for i, token in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") + + self.assertListEqual(tokenizer.tokenize(""), []) + + self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + + def test_is_whitespace(self): + self.assertTrue(_is_whitespace(" ")) + self.assertTrue(_is_whitespace("\t")) + self.assertTrue(_is_whitespace("\r")) + self.assertTrue(_is_whitespace("\n")) + self.assertTrue(_is_whitespace("\u00a0")) + + self.assertFalse(_is_whitespace("A")) + self.assertFalse(_is_whitespace("-")) + + def test_is_control(self): + self.assertTrue(_is_control("\u0005")) + + self.assertFalse(_is_control("A")) + self.assertFalse(_is_control(" ")) + self.assertFalse(_is_control("\t")) + self.assertFalse(_is_control("\r")) + + def test_is_punctuation(self): + self.assertTrue(_is_punctuation("-")) + self.assertTrue(_is_punctuation("$")) + self.assertTrue(_is_punctuation("`")) + self.assertTrue(_is_punctuation(".")) + + self.assertFalse(_is_punctuation("A")) + self.assertFalse(_is_punctuation(" ")) + + def test_clean_text(self): + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340 + self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]) + + self.assertListEqual( + [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]] + ) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("google/electra-base-discriminator") + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [101] + text + [102] + assert encoded_pair == [101] + text + [102] + text_2 + [102] + + def test_offsets_with_special_characters(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + + sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." 
+ tokens = tokenizer_r.encode_plus( + sentence, + return_attention_mask=False, + return_token_type_ids=False, + return_offsets_mapping=True, + add_special_tokens=True, + ) + + do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False + expected_results = ( + [ + ((0, 0), tokenizer_r.cls_token), + ((0, 1), "A"), + ((1, 2), ","), + ((3, 5), "na"), + ((5, 6), "##ï"), + ((6, 8), "##ve"), + ((9, 15), tokenizer_r.mask_token), + ((16, 21), "Allen"), + ((21, 23), "##NL"), + ((23, 24), "##P"), + ((25, 33), "sentence"), + ((33, 34), "."), + ((0, 0), tokenizer_r.sep_token), + ] + if not do_lower_case + else [ + ((0, 0), tokenizer_r.cls_token), + ((0, 1), "a"), + ((1, 2), ","), + ((3, 8), "naive"), + ((9, 15), tokenizer_r.mask_token), + ((16, 21), "allen"), + ((21, 23), "##nl"), + ((23, 24), "##p"), + ((25, 33), "sentence"), + ((33, 34), "."), + ((0, 0), tokenizer_r.sep_token), + ] + ) + + self.assertEqual( + [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"]) + ) + self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"]) + + def test_change_tokenize_chinese_chars(self): + list_of_commun_chinese_char = ["的", "人", "有"] + text_with_chinese_char = "".join(list_of_commun_chinese_char) + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + kwargs["tokenize_chinese_chars"] = True + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + + ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) + ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) + + tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r) + tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p) + + # it is expected that each Chinese character is not preceded by "##" + self.assertListEqual(tokens_without_spe_char_p, list_of_commun_chinese_char) + self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char) + + kwargs["tokenize_chinese_chars"] = False + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + + ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) + ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) + + tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r) + tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p) + + # it is expected that only the first Chinese character is not preceded by "##". 
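+ # Concretely (assuming all three characters are in the pretrained vocab): with tokenize_chinese_chars=False the string is kept as a single word, so WordPiece should yield ["的", "##人", "##有"].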
+ expected_tokens = [ + f"##{token}" if idx != 0 else token for idx, token in enumerate(list_of_commun_chinese_char) + ] + self.assertListEqual(tokens_without_spe_char_p, expected_tokens) + self.assertListEqual(tokens_without_spe_char_r, expected_tokens) diff --git a/docs/transformers/tests/models/emu3/__init__.py b/docs/transformers/tests/models/emu3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/emu3/test_modeling_emu3.py b/docs/transformers/tests/models/emu3/test_modeling_emu3.py new file mode 100644 index 0000000000000000000000000000000000000000..b27b1d4c708a6eb7f459bf5019c6e23ea12f7dea --- /dev/null +++ b/docs/transformers/tests/models/emu3/test_modeling_emu3.py @@ -0,0 +1,545 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch emu3 model.""" + +import unittest + +import numpy as np +import pytest +import requests +from huggingface_hub import hf_hub_download +from parameterized import parameterized + +from transformers import Emu3Config, Emu3TextConfig, is_torch_available, is_vision_available, set_seed +from transformers.testing_utils import ( + Expectations, + require_bitsandbytes, + require_torch, + require_torch_large_accelerator, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_vision_available(): + from PIL import Image + +if is_torch_available(): + import torch + + from transformers import ( + Emu3ForCausalLM, + Emu3ForConditionalGeneration, + Emu3Processor, + Emu3TextModel, + ) + + +class Emu3Text2TextModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=False, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=2, + num_key_value_heads=2, + intermediate_size=37, + max_position_embeddings=512, + initializer_range=0.02, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.intermediate_size = intermediate_size + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + attention_mask = input_ids.ne(1).to(torch_device) + + config = self.get_config() + + return config, 
input_ids, attention_mask + + def get_config(self): + return Emu3TextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + num_key_value_heads=self.num_key_value_heads, + intermediate_size=self.intermediate_size, + max_position_embeddings=self.max_position_embeddings, + is_decoder=False, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_torch +class Emu3Text2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (Emu3ForCausalLM,) if is_torch_available() else () + pipeline_model_mapping = ( + { + "text-generation": Emu3ForCausalLM, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + fx_compatible = False + + def setUp(self): + self.model_tester = Emu3Text2TextModelTester(self) + self.config_tester = ConfigTester(self, config_class=Emu3TextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @parameterized.expand([("linear",), ("dynamic",)]) + def test_model_rope_scaling(self, scaling_type): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + short_input = ids_tensor([1, 10], config.vocab_size) + long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + original_model = Emu3TextModel(config) + original_model.to(torch_device) + original_model.eval() + original_short_output = original_model(short_input).last_hidden_state + original_long_output = original_model(long_input).last_hidden_state + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + config.rope_scaling = {"type": scaling_type, "factor": 10.0} + scaled_model = Emu3TextModel(config) + scaled_model.to(torch_device) + scaled_model.eval() + scaled_short_output = scaled_model(short_input).last_hidden_state + scaled_long_output = scaled_model(long_input).last_hidden_state + + # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original + # maximum sequence length, so the outputs for the short input should match. 
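+ # Linear scaling, by contrast, rescales every position id immediately, so even the short-input outputs are expected to differ (hence the assertFalse branch below).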
+ if scaling_type == "dynamic": + torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) + else: + self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) + + # The output should be different for long inputs + self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) + + @unittest.skip("Doesn't work, tensors are not almost same") # TODO raushan fixme + def test_custom_4d_attention_mask(self): + pass + + +class Emu3Vision2TextModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=False, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=2, + num_key_value_heads=2, + intermediate_size=37, + max_position_embeddings=512, + initializer_range=0.02, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + image_token_id=3, + image_size=30, + codebook_size=20, + temporal_downsample_factor=1, + base_channels=32, + vq_channel_multiplier=[1, 1], + image_seq_length=100, + vq_img_token_start_id=3, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.intermediate_size = intermediate_size + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.image_token_id = image_token_id + self.image_size = image_size + self.codebook_size = codebook_size + self.temporal_downsample_factor = temporal_downsample_factor + self.vq_channel_multiplier = vq_channel_multiplier + self.vq_img_token_start_id = vq_img_token_start_id + self.base_channels = base_channels + self.seq_length = seq_length + image_seq_length + self.image_seq_length = image_seq_length + + def prepare_config_and_inputs(self): + config = self.get_config() + + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size) + attention_mask = input_ids.ne(1).to(torch_device) + input_ids[input_ids == self.image_token_id] = self.pad_token_id + input_ids[:, : self.image_seq_length] = self.image_token_id + + pixel_values = floats_tensor( + [ + self.batch_size, + 3, + self.image_size, + self.image_size, + ] + ) + image_sizes = [[self.image_size, self.image_size]] * self.batch_size + image_sizes = torch.tensor(image_sizes, device=torch_device, dtype=torch.int64) + + return config, input_ids, attention_mask, pixel_values, image_sizes + + def get_config(self): + # create dummy vocab map for image2bpe mapping if it needs remapping + # we assume that vocab size is big enough to account for `codebook_size` amount of + # image tokens somewhere at the beginning of total vocab size + + vocab_map = {i: chr(i) for i in range(self.vocab_size)} + start = self.vq_img_token_start_id + end = self.vq_img_token_start_id + self.codebook_size + for i in range(start, end): + # dummy str for each token, anything that fits pattern "<|visual token XXXXXX|>" + vocab_map[i] = f"<|visual token{i:06d}|>" + + # add tokens that have to be in the vocab, we'll retrieve their ids later in modeling code + vocab_map[self.image_token_id] = "" + vocab_map[self.image_token_id + 1] = "<|extra_200|>" + vocab_map = {v: k for k, v in vocab_map.items()} + + text_config = Emu3TextConfig( + 
vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + num_key_value_heads=self.num_key_value_heads, + intermediate_size=self.intermediate_size, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + ) + + vq_config = { + "codebook_size": self.codebook_size, + "temporal_downsample_factor": self.temporal_downsample_factor, + "base_channels": self.base_channels, + "channel_multiplier": self.vq_channel_multiplier, + "hidden_size": self.base_channels, + } + return Emu3Config(text_config=text_config, vq_config=vq_config, vocabulary_map=vocab_map) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + attention_mask, + pixel_values, + image_sizes, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "image_sizes": image_sizes, + } + return config, inputs_dict + + +@require_torch +class Emu3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (Emu3ForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = {} + test_headmasking = False + test_pruning = False + fx_compatible = False + + def setUp(self): + self.model_tester = Emu3Vision2TextModelTester(self) + self.config_tester = ConfigTester( + self, config_class=Emu3Config, has_text_modality=False, common_properties=["vocabulary_map"] + ) + + def test_config(self): + self.config_tester.run_common_tests() + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + # while some other models require pixel_values to be present + def test_inputs_embeds_matches_input_ids(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + inputs_embeds = model.get_input_embeddings()(input_ids) + + with torch.no_grad(): + out_ids = model(input_ids=input_ids, **inputs)[0] + out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] + torch.testing.assert_close(out_embeds, out_ids) + + @unittest.skip( + "Emu3 has a VQ module that uses `weight.data` directly in forward which prevent offloding on that module" + ) + def test_disk_offload_safetensors(self): + pass + + @unittest.skip( + "Emu3 has a VQ module that uses `weight.data` directly in forward which prevent offloding on that module" + ) + def test_disk_offload_bin(self): + pass + + @unittest.skip( 
+ "Emu3 has a VQ module that uses `weight.data` directly in forward which prevent offloding on that module" + ) + def test_cpu_offload(self): + pass + + @unittest.skip("VQ-VAE module doesn't initialize weights properly") + def test_initialization(self): + pass + + @pytest.mark.generate + @unittest.skip("Emu3 has dynamic control flow in vision backbone") + def test_generate_with_static_cache(self): + pass + + +@require_torch +class Emu3IntegrationTest(unittest.TestCase): + @slow + @require_bitsandbytes + def test_model_generation(self): + model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", load_in_4bit=True) + processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf") + + image = Image.open(requests.get("https://picsum.photos/id/237/200/200", stream=True).raw) + prompt = "USER: Describe what do you see here and tell me about the history behind it? ASSISTANT:" + + inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, torch.float16) + + # greedy generation outputs + EXPECTED_TEXT_COMPLETION = ['USER: 64*64Describe what do you see here and tell me about the history behind it? ASSISTANT: The image captures a moment of tranquility with a black Labrador Retriever resting on a wooden floor. The dog, with its glossy black coat, is lying down with its front legs stretched out in'] # fmt: skip + generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False) + text = processor.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, text) + + @slow + @require_bitsandbytes + @require_torch_large_accelerator + def test_model_generation_batched(self): + model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", load_in_4bit=True) + processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf") + processor.tokenizer.padding_side = "left" + + image = Image.open(requests.get("https://picsum.photos/id/237/50/50", stream=True).raw) + image_2 = Image.open(requests.get("https://picsum.photos/id/247/50/50", stream=True).raw) + prompts = [ + "USER: Describe what do you see here? ASSISTANT:", + "USER: What can you say about the image? ASSISTANT:", + ] + + inputs = processor(images=[image, image_2], text=prompts, padding=True, return_tensors="pt").to( + model.device, torch.float16 + ) + + # greedy generation outputs + EXPECTED_TEXT_COMPLETIONS = Expectations( + { + ("xpu", 3): [ + "USER: 64*64Describe what do you see here? ASSISTANT: The image depicts a black panther in a crouched position. The panther's body is elongated and its head is lowered, suggesting a state of alertness or readiness. The animal's", + "USER: 64*64What can you say about the image? ASSISTANT: The image depicts a serene natural landscape. The foreground consists of a grassy area with some patches of bare earth. The middle ground shows a gently sloping hill with a reddish-brown hue,", + ], + ("cuda", 7): [ + "USER: 64*64Describe what do you see here? ASSISTANT: The image depicts a black panther in a crouched position. The panther's body is elongated and curved, with its head lowered and ears pointed forward, suggesting alertness or focus.", + "USER: 64*64What can you say about the image? ASSISTANT: The image depicts a serene natural landscape. The foreground consists of a grassy area with some patches of bare earth. 
The middle ground shows a steep, reddish-brown cliff, which could be a", + ], + } + ) # fmt: skip + EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation() + + generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False) + text = processor.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, text) + + @slow + @require_bitsandbytes + @require_torch_large_accelerator + def test_model_generation_multi_image(self): + model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", load_in_4bit=True) + processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf") + + image = Image.open(requests.get("https://picsum.photos/id/237/50/50", stream=True).raw) + image_2 = Image.open(requests.get("https://picsum.photos/id/247/50/50", stream=True).raw) + prompt = "USER: What do these two images have in common? ASSISTANT:" + + inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.float16) + + # greedy generation outputs + EXPECTED_TEXT_COMPLETIONS = Expectations( + { + ("xpu", 3): ['USER: 64*6464*64What do these two images have in common? ASSISTANT: The two images both depict a rhinoceros, yet they are significantly different in terms of focus and clarity. The rhinoceros in the upper image is in sharp focus, showing detailed textures'], + ("cuda", 7): ["USER: 64*6464*64What do these two images have in common? ASSISTANT: Both images feature a black animal, but they are not the same animal. The top image shows a close-up of a black cow's head, while the bottom image depicts a black cow in a natural"], + } + ) # fmt: skip + EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation() + generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False) + text = processor.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, text) + + @slow + @require_bitsandbytes + @require_torch_large_accelerator + def test_model_generate_images(self): + model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Gen-hf", load_in_4bit=True) + processor = Emu3Processor.from_pretrained("BAAI/Emu3-Gen-hf") + + inputs = processor( + text=["a portrait of young girl. 
masterpiece, film grained, best quality."], + padding=True, + return_tensors="pt", + return_for_image_generation=True, + image_area=1600, + ).to(model.device) + self.assertTrue(inputs.input_ids.shape[1] == 21) + + image_sizes = inputs.pop("image_sizes") + HEIGHT, WIDTH = image_sizes[0] + VISUAL_TOKENS = model.vocabulary_mapping.image_tokens + + def prefix_allowed_tokens_fn(batch_id, input_ids): + height, width = HEIGHT, WIDTH + visual_tokens = VISUAL_TOKENS + image_wrapper_token_id = torch.tensor([processor.tokenizer.image_wrapper_token_id], device=model.device) + eoi_token_id = torch.tensor([processor.tokenizer.eoi_token_id], device=model.device) + eos_token_id = torch.tensor([processor.tokenizer.eos_token_id], device=model.device) + pad_token_id = torch.tensor([processor.tokenizer.pad_token_id], device=model.device) + eof_token_id = torch.tensor([processor.tokenizer.eof_token_id], device=model.device) + eol_token_id = processor.tokenizer.encode("<|extra_200|>", return_tensors="pt")[0] + + position = torch.nonzero(input_ids == image_wrapper_token_id, as_tuple=True)[0][0] + offset = input_ids.shape[0] - position + if offset % (width + 1) == 0: + return (eol_token_id,) + elif offset == (width + 1) * height + 1: + return (eof_token_id,) + elif offset == (width + 1) * height + 2: + return (eoi_token_id,) + elif offset == (width + 1) * height + 3: + return (eos_token_id,) + elif offset > (width + 1) * height + 3: + return (pad_token_id,) + else: + return visual_tokens + + out = model.generate( + **inputs, + max_new_tokens=200, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + do_sample=False, + ) + self.assertTrue(out.shape[1] == 54) + + image = model.decode_image_tokens(out[:, inputs.input_ids.shape[1] :], height=HEIGHT, width=WIDTH) + images = processor.postprocess(list(image.float()), return_tensors="np") + self.assertTrue(images["pixel_values"].shape == (3, 40, 40)) + self.assertTrue(isinstance(images["pixel_values"], np.ndarray)) + + filepath = hf_hub_download( + repo_id="raushan-testing-hf/images_test", + filename="emu3_image.npy", + repo_type="dataset", + ) + original_pixels = np.load(filepath) + self.assertTrue(np.allclose(original_pixels, images["pixel_values"])) diff --git a/docs/transformers/tests/models/emu3/test_processor_emu3.py b/docs/transformers/tests/models/emu3/test_processor_emu3.py new file mode 100644 index 0000000000000000000000000000000000000000..c595a91ee99ff802ba9ba4940e6e6d6875717b16 --- /dev/null +++ b/docs/transformers/tests/models/emu3/test_processor_emu3.py @@ -0,0 +1,92 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
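+# Note: the tests below exercise the Emu3 processor only (chat-template handling, image-generation prompts, and pixel-value postprocessing), not the PyTorch model.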
+"""Testing suite for the PyTorch emu3 model.""" + +import tempfile +import unittest + +import numpy as np + +from transformers import Emu3Processor, GPT2TokenizerFast +from transformers.utils import is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import Emu3ImageProcessor + + +class Emu3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = Emu3Processor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + image_processor = Emu3ImageProcessor(min_pixels=28 * 28, max_pixels=56 * 56) + extra_special_tokens = extra_special_tokens = { + "image_token": "", + "boi_token": "<|image start|>", + "eoi_token": "<|image end|>", + "image_wrapper_token": "<|image token|>", + "eof_token": "<|extra_201|>", + } + tokenizer = GPT2TokenizerFast.from_pretrained( + "openai-community/gpt2", extra_special_tokens=extra_special_tokens + ) + tokenizer.pad_token_id = 0 + tokenizer.sep_token_id = 1 + processor = cls.processor_class( + image_processor=image_processor, tokenizer=tokenizer, chat_template="dummy_template" + ) + processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.image_token + + @staticmethod + def prepare_processor_dict(): + return { + "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}", + } # fmt: skip + + def test_processor_for_generation(self): + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) + + # we don't need an image as input because the model will generate one + input_str = "lower newer" + image_input = self.prepare_image_inputs() + inputs = processor(text=input_str, return_for_image_generation=True, return_tensors="pt") + self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "image_sizes"]) + self.assertEqual(inputs[self.text_input_name].shape[-1], 8) + + # when `return_for_image_generation` is set, we raise an error that image should not be provided + with self.assertRaises(ValueError): + inputs = processor( + text=input_str, images=image_input, return_for_image_generation=True, return_tensors="pt" + ) + + def test_processor_postprocess(self): + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) + + input_str = "lower newer" + orig_image_input = self.prepare_image_inputs() + orig_image = np.array(orig_image_input).transpose(2, 0, 1) + + inputs = processor(text=input_str, images=orig_image, do_resize=False, return_tensors="np") + normalized_image_input = inputs.pixel_values + unnormalized_images = processor.postprocess(normalized_image_input, return_tensors="np")["pixel_values"] + + # For an image where pixels go from 0 to 255 the diff can be 1 due to some numerical precision errors when scaling and unscaling + self.assertTrue(np.abs(orig_image - unnormalized_images).max() >= 1) diff 
--git a/docs/transformers/tests/models/encodec/__init__.py b/docs/transformers/tests/models/encodec/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/encodec/test_feature_extraction_encodec.py b/docs/transformers/tests/models/encodec/test_feature_extraction_encodec.py new file mode 100644 index 0000000000000000000000000000000000000000..3dc4c5fbb7ca1142ae7b030562e25abe4a4787fb --- /dev/null +++ b/docs/transformers/tests/models/encodec/test_feature_extraction_encodec.py @@ -0,0 +1,252 @@ +# Copyright 2021-2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for the EnCodec feature extractor.""" + +import itertools +import random +import unittest + +import numpy as np + +from transformers import EncodecFeatureExtractor +from transformers.testing_utils import require_torch +from transformers.utils.import_utils import is_torch_available + +from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin + + +if is_torch_available(): + import torch + + +global_rng = random.Random() + + +# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values + + +@require_torch +class EnCodecFeatureExtractionTester: + def __init__( + self, + parent, + batch_size=7, + min_seq_length=400, + max_seq_length=2000, + feature_size=1, + padding_value=0.0, + sampling_rate=24000, + return_attention_mask=True, + ): + self.parent = parent + self.batch_size = batch_size + self.min_seq_length = min_seq_length + self.max_seq_length = max_seq_length + self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) + self.feature_size = feature_size + self.padding_value = padding_value + self.sampling_rate = sampling_rate + self.return_attention_mask = return_attention_mask + + def prepare_feat_extract_dict(self): + return { + "feature_size": self.feature_size, + "padding_value": self.padding_value, + "sampling_rate": self.sampling_rate, + "return_attention_mask": self.return_attention_mask, + } + + def prepare_inputs_for_common(self, equal_length=False, numpify=False): + def _flatten(list_of_lists): + return list(itertools.chain(*list_of_lists)) + + if equal_length: + audio_inputs = floats_list((self.batch_size, self.max_seq_length)) + else: + # make sure that inputs increase in size + audio_inputs = [ + _flatten(floats_list((x, self.feature_size))) + for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) + ] + + if numpify: + audio_inputs = [np.asarray(x) for x in audio_inputs] + + return audio_inputs + + +@require_torch +class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, 
unittest.TestCase): + feature_extraction_class = EncodecFeatureExtractor + + def setUp(self): + self.feat_extract_tester = EnCodecFeatureExtractionTester(self) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + # create three inputs of length 800, 1000, and 1200 + audio_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + np_audio_inputs = [np.asarray(audio_input) for audio_input in audio_inputs] + + # Test not batched input + encoded_sequences_1 = feat_extract(audio_inputs[0], return_tensors="np").input_values + encoded_sequences_2 = feat_extract(np_audio_inputs[0], return_tensors="np").input_values + self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) + + # Test batched + encoded_sequences_1 = feat_extract(audio_inputs, padding=True, return_tensors="np").input_values + encoded_sequences_2 = feat_extract(np_audio_inputs, padding=True, return_tensors="np").input_values + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + def test_double_precision_pad(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + np_audio_inputs = np.random.rand(100).astype(np.float64) + py_audio_inputs = np_audio_inputs.tolist() + + for inputs in [py_audio_inputs, np_audio_inputs]: + np_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="np") + self.assertTrue(np_processed.input_values.dtype == np.float32) + pt_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="pt") + self.assertTrue(pt_processed.input_values.dtype == torch.float32) + + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return [x["array"] for x in audio_samples] + + def test_integration(self): + # fmt: off + EXPECTED_INPUT_VALUES = torch.tensor( + [2.3804e-03, 2.0752e-03, 1.9836e-03, 2.1057e-03, 1.6174e-03, + 3.0518e-04, 9.1553e-05, 3.3569e-04, 9.7656e-04, 1.8311e-03, + 2.0142e-03, 2.1057e-03, 1.7395e-03, 4.5776e-04, -3.9673e-04, + 4.5776e-04, 1.0071e-03, 9.1553e-05, 4.8828e-04, 1.1597e-03, + 7.3242e-04, 9.4604e-04, 1.8005e-03, 1.8311e-03, 8.8501e-04, + 4.2725e-04, 4.8828e-04, 7.3242e-04, 1.0986e-03, 2.1057e-03] + ) + # fmt: on + input_audio = self._load_datasamples(1) + feature_extractor = EncodecFeatureExtractor() + input_values = feature_extractor(input_audio, return_tensors="pt").input_values + self.assertEqual(input_values.shape, (1, 1, 93680)) + torch.testing.assert_close(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, rtol=1e-6, atol=1e-6) + + def test_integration_stereo(self): + # fmt: off + EXPECTED_INPUT_VALUES = torch.tensor( + [2.3804e-03, 2.0752e-03, 1.9836e-03, 2.1057e-03, 1.6174e-03, + 3.0518e-04, 9.1553e-05, 3.3569e-04, 9.7656e-04, 1.8311e-03, + 2.0142e-03, 2.1057e-03, 1.7395e-03, 4.5776e-04, -3.9673e-04, + 4.5776e-04, 1.0071e-03, 9.1553e-05, 4.8828e-04, 1.1597e-03, + 7.3242e-04, 9.4604e-04, 1.8005e-03, 1.8311e-03, 8.8501e-04, + 4.2725e-04, 4.8828e-04, 7.3242e-04, 1.0986e-03, 2.1057e-03] + ) + # fmt: on + input_audio = self._load_datasamples(1) + input_audio = [np.tile(input_audio[0][None], reps=(2, 
1))] + input_audio[0][1] *= 0.5 + feature_extractor = EncodecFeatureExtractor(feature_size=2) + input_values = feature_extractor(input_audio, return_tensors="pt").input_values + self.assertEqual(input_values.shape, (1, 2, 93680)) + torch.testing.assert_close(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, rtol=1e-6, atol=1e-6) + torch.testing.assert_close(input_values[0, 1, :30], EXPECTED_INPUT_VALUES * 0.5, rtol=1e-6, atol=1e-6) + + def test_truncation_and_padding(self): + input_audio = self._load_datasamples(2) + # would be easier if the stride was like + feature_extractor = EncodecFeatureExtractor(feature_size=1, chunk_length_s=1, overlap=0.01) + + # pad and trunc raise an error ? + with self.assertRaisesRegex( + ValueError, + "^Both padding and truncation were set. Make sure you only set one.$", + ): + truncated_outputs = feature_extractor( + input_audio, padding="max_length", truncation=True, return_tensors="pt" + ).input_values + + # truncate to chunk + truncated_outputs = feature_extractor(input_audio, truncation=True, return_tensors="pt").input_values + self.assertEqual(truncated_outputs.shape, (2, 1, 71520)) # 2 chunks + + # force truncate to max_length + truncated_outputs = feature_extractor( + input_audio, truncation=True, max_length=48000, return_tensors="pt" + ).input_values + self.assertEqual(truncated_outputs.shape, (2, 1, 48000)) + + # pad to chunk + padded_outputs = feature_extractor(input_audio, padding=True, return_tensors="pt").input_values + self.assertEqual(padded_outputs.shape, (2, 1, 95280)) + + # pad to chunk + truncated_outputs = feature_extractor(input_audio, return_tensors="pt").input_values + self.assertEqual(truncated_outputs.shape, (2, 1, 95280)) + + # force pad to max length + truncated_outputs = feature_extractor( + input_audio, padding="max_length", max_length=100000, return_tensors="pt" + ).input_values + self.assertEqual(truncated_outputs.shape, (2, 1, 100000)) + + # force no pad + with self.assertRaisesRegex( + ValueError, + "^Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.$", + ): + truncated_outputs = feature_extractor(input_audio, padding=False, return_tensors="pt").input_values + + truncated_outputs = feature_extractor(input_audio[0], padding=False, return_tensors="pt").input_values + self.assertEqual(truncated_outputs.shape, (1, 1, 93680)) + + # no pad if no chunk_length_s + feature_extractor.chunk_length_s = None + with self.assertRaisesRegex( + ValueError, + "^Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.$", + ): + truncated_outputs = feature_extractor(input_audio, padding=False, return_tensors="pt").input_values + + truncated_outputs = feature_extractor(input_audio[0], padding=False, return_tensors="pt").input_values + self.assertEqual(truncated_outputs.shape, (1, 1, 93680)) + + # no pad if no overlap + feature_extractor.chunk_length_s = 2 + feature_extractor.overlap = None + with self.assertRaisesRegex( + ValueError, + "^Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.$", + ): + truncated_outputs = feature_extractor(input_audio, padding=False, return_tensors="pt").input_values + + truncated_outputs = feature_extractor(input_audio[0], padding=False, return_tensors="pt").input_values + self.assertEqual(truncated_outputs.shape, (1, 1, 93680)) diff --git 
a/docs/transformers/tests/models/encodec/test_modeling_encodec.py b/docs/transformers/tests/models/encodec/test_modeling_encodec.py new file mode 100644 index 0000000000000000000000000000000000000000..bb6458bbc3f387226ec28d46625ae0372b2e5f43 --- /dev/null +++ b/docs/transformers/tests/models/encodec/test_modeling_encodec.py @@ -0,0 +1,635 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Encodec model.""" + +import copy +import inspect +import os +import tempfile +import unittest + +import numpy as np +from datasets import Audio, load_dataset + +from transformers import AutoProcessor, EncodecConfig +from transformers.testing_utils import ( + is_torch_available, + require_torch, + slow, + torch_device, +) + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import EncodecFeatureExtractor, EncodecModel + + +def prepare_inputs_dict( + config, + input_ids=None, + input_values=None, + decoder_input_ids=None, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if input_ids is not None: + encoder_dict = {"input_ids": input_ids} + else: + encoder_dict = {"input_values": input_values} + + decoder_dict = {"decoder_input_ids": decoder_input_ids} if decoder_input_ids is not None else {} + + return {**encoder_dict, **decoder_dict} + + +@require_torch +class EncodecModelTester: + def __init__( + self, + parent, + # `batch_size` needs to be an even number if the model has some outputs with batch dim != 0. 
+ batch_size=12, + num_channels=2, + is_training=False, + intermediate_size=40, + hidden_size=32, + num_filters=8, + num_residual_layers=1, + upsampling_ratios=[8, 4], + num_lstm_layers=1, + codebook_size=64, + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.is_training = is_training + self.intermediate_size = intermediate_size + self.hidden_size = hidden_size + self.num_filters = num_filters + self.num_residual_layers = num_residual_layers + self.upsampling_ratios = upsampling_ratios + self.num_lstm_layers = num_lstm_layers + self.codebook_size = codebook_size + + def prepare_config_and_inputs(self): + input_values = floats_tensor([self.batch_size, self.num_channels, self.intermediate_size], scale=1.0) + config = self.get_config() + inputs_dict = {"input_values": input_values} + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def prepare_config_and_inputs_for_model_class(self, model_class): + config, inputs_dict = self.prepare_config_and_inputs() + inputs_dict["audio_codes"] = ids_tensor([1, self.batch_size, 1, self.num_channels], self.codebook_size).type( + torch.int32 + ) + inputs_dict["audio_scales"] = [None] + + return config, inputs_dict + + def prepare_config_and_inputs_for_normalization(self): + input_values = floats_tensor([self.batch_size, self.num_channels, self.intermediate_size], scale=1.0) + config = self.get_config() + config.normalize = True + + processor = EncodecFeatureExtractor(feature_size=config.audio_channels, sampling_rate=config.sampling_rate) + input_values = input_values.tolist() + inputs_dict = processor( + input_values, sampling_rate=config.sampling_rate, padding=True, return_tensors="pt" + ).to(torch_device) + + return config, inputs_dict + + def get_config(self): + return EncodecConfig( + audio_channels=self.num_channels, + chunk_in_sec=None, + hidden_size=self.hidden_size, + num_filters=self.num_filters, + num_residual_layers=self.num_residual_layers, + upsampling_ratios=self.upsampling_ratios, + num_lstm_layers=self.num_lstm_layers, + codebook_size=self.codebook_size, + ) + + def create_and_check_model_forward(self, config, inputs_dict): + model = EncodecModel(config=config).to(torch_device).eval() + result = model(**inputs_dict) + self.parent.assertEqual( + result.audio_values.shape, (self.batch_size, self.num_channels, self.intermediate_size) + ) + + +@require_torch +class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (EncodecModel,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_headmasking = False + test_resize_embeddings = False + pipeline_model_mapping = {"feature-extraction": EncodecModel} if is_torch_available() else {} + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + # model does not have attention and does not support returning hidden states + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + if "output_attentions" in inputs_dict: + inputs_dict.pop("output_attentions") + if "output_hidden_states" in inputs_dict: + inputs_dict.pop("output_hidden_states") + return inputs_dict + + def setUp(self): + self.model_tester = EncodecModelTester(self) + self.config_tester = ConfigTester( + self, config_class=EncodecConfig, hidden_size=37, common_properties=[], has_text_modality=False + ) + + def test_config(self): + 
self.config_tester.run_common_tests() + + def test_model_forward(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_forward(*config_and_inputs) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["input_values", "padding_mask", "bandwidth"] + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + @unittest.skip(reason="The EncodecModel is not transformers based, thus it does not have `inputs_embeds` logics") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="The EncodecModel is not transformers based, thus it does not have `inputs_embeds` logics") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `attention` logic" + ) + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `attention` logic" + ) + def test_torchscript_output_attentions(self): + pass + + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `hidden_states` logic" + ) + def test_torchscript_output_hidden_state(self): + pass + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + self.skipTest(reason="test_torchscript is set to False") + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + configs_no_init.return_dict = False + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + inputs = self._prepare_for_class(inputs_dict, model_class) + + main_input_name = model_class.main_input_name + + try: + main_input = inputs[main_input_name] + model(main_input) + traced_model = torch.jit.trace(model, main_input) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + non_persistent_buffers = {} + for key in loaded_model_state_dict.keys(): + if key not in model_state_dict.keys(): + non_persistent_buffers[key] = loaded_model_state_dict[key] + + loaded_model_state_dict = { + key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers + } + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + model_buffers = list(model.buffers()) + for non_persistent_buffer in non_persistent_buffers.values(): + found_buffer = False + for i, model_buffer in enumerate(model_buffers): + if torch.equal(non_persistent_buffer, model_buffer): + found_buffer = True + break + + 
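+ # every buffer that only appears in the traced model's state dict (a non-persistent buffer) must still
+ # match one of the original model's buffers; it is then removed from the pool so it cannot match twice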
self.assertTrue(found_buffer) + model_buffers.pop(i) + + model_buffers = list(model.buffers()) + for non_persistent_buffer in non_persistent_buffers.values(): + found_buffer = False + for i, model_buffer in enumerate(model_buffers): + if torch.equal(non_persistent_buffer, model_buffer): + found_buffer = True + break + + self.assertTrue(found_buffer) + model_buffers.pop(i) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + if layer_name in loaded_model_state_dict: + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + # Avoid memory leak. Without this, each call increase RAM usage by ~20MB. + # (Even with this call, there are still memory leak by ~0.04MB) + self.clear_torch_jit_class_registry() + + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `attention` logic" + ) + def test_attention_outputs(self): + pass + + def test_feed_forward_chunking(self): + (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + torch.manual_seed(0) + config = copy.deepcopy(original_config) + config.chunk_length_s = None + config.overlap = None + config.sampling_rate = 10 + + model = model_class(config) + model.to(torch_device) + model.eval() + inputs = self._prepare_for_class(inputs_dict, model_class) + inputs["input_values"] = inputs["input_values"].repeat(1, 1, 10) + + hidden_states_no_chunk = model(**inputs)[1] + + torch.manual_seed(0) + config.chunk_length_s = 1 + config.overlap = 0 + config.sampling_rate = 10 + + model = model_class(config) + model.to(torch_device) + model.eval() + + hidden_states_with_chunk = model(**inputs)[1] + torch.testing.assert_close(hidden_states_no_chunk, hidden_states_with_chunk, rtol=1e-1, atol=1e-2) + + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `hidden_states` logic" + ) + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage(self): + pass + + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_checkpoints(self): + pass + + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") + def test_save_load_low_cpu_mem_usage_no_safetensors(self): + pass + + def test_determinism(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_determinism(first, second): + # outputs are not tensors but list (since each sequence don't have the same frame_length) + out_1 = first.cpu().numpy() + out_2 = second.cpu().numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + first = model(**self._prepare_for_class(inputs_dict, model_class))[0] + second = model(**self._prepare_for_class(inputs_dict, model_class))[0] + + if isinstance(first, tuple) and isinstance(second, tuple): + for tensor1, tensor2 in zip(first, second): + check_determinism(tensor1, tensor2) + else: + check_determinism(first, second) + + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def set_nan_tensor_to_zero(t): + 
t[t != t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs) + + self.assertTrue(isinstance(tuple_output, tuple)) + self.assertTrue(isinstance(dict_output, dict)) + + for tuple_value, dict_value in zip(tuple_output, dict_output.values()): + self.assertTrue( + torch.allclose( + set_nan_tensor_to_zero(tuple_value), set_nan_tensor_to_zero(dict_value), atol=1e-5 + ), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {torch.max(torch.abs(tuple_value - dict_value))}. Tuple has `nan`:" + f" {torch.isnan(tuple_value).any()} and `inf`: {torch.isinf(tuple_value)}. Dict has" + f" `nan`: {torch.isnan(dict_value).any()} and `inf`: {torch.isinf(dict_value)}." + ), + ) + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = ["conv"] + ignore_init = ["lstm"] + if param.requires_grad: + if any(x in name for x in uniform_init_parms): + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + elif not any(x in name for x in ignore_init): + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_identity_shortcut(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + config.use_conv_shortcut = False + self.model_tester.create_and_check_model_forward(config, inputs_dict) + + def test_model_forward_with_normalization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_normalization() + self.model_tester.create_and_check_model_forward(config, inputs_dict) + + +def normalize(arr): + norm = np.linalg.norm(arr) + normalized_arr = arr / norm + return normalized_arr + + +def compute_rmse(arr1, arr2): + arr1_normalized = normalize(arr1) + arr2_normalized = normalize(arr2) + return np.sqrt(((arr1_normalized - arr2_normalized) ** 2).mean()) + + +@slow +@require_torch +class EncodecIntegrationTest(unittest.TestCase): + def test_integration_24kHz(self): + expected_rmse = { + "1.5": 0.0025, + "24.0": 0.0015, + } + expected_codesums = { + "1.5": [371955], + "24.0": [6659962], + } + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + model_id = "facebook/encodec_24khz" + + model = EncodecModel.from_pretrained(model_id).to(torch_device) + processor = AutoProcessor.from_pretrained(model_id) + + librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) + audio_sample = librispeech_dummy[-1]["audio"]["array"] + + inputs = processor( + raw_audio=audio_sample, + sampling_rate=processor.sampling_rate, + return_tensors="pt", + ).to(torch_device) + 
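+ # For each target bandwidth: encode and compare the summed audio codes against the reference values, then
+ # check that decode() matches the full forward pass and that the reconstruction RMSE stays below the expected bound.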
+ for bandwidth, expected_rmse in expected_rmse.items(): + with torch.no_grad(): + # use max bandwidth for best possible reconstruction + encoder_outputs = model.encode(inputs["input_values"], bandwidth=float(bandwidth)) + + audio_code_sums = [a[0].sum().item() for a in encoder_outputs[0]] + + # make sure audio encoded codes are correct + self.assertListEqual(audio_code_sums, expected_codesums[bandwidth]) + + audio_codes, scales = encoder_outputs.to_tuple() + input_values_dec = model.decode(audio_codes, scales, inputs["padding_mask"])[0] + input_values_enc_dec = model( + inputs["input_values"], inputs["padding_mask"], bandwidth=float(bandwidth) + )[-1] + + # make sure forward and decode gives same result + torch.testing.assert_close(input_values_dec, input_values_enc_dec, rtol=1e-3, atol=1e-3) + + # make sure shape matches + self.assertTrue(inputs["input_values"].shape == input_values_enc_dec.shape) + + arr = inputs["input_values"][0].cpu().numpy() + arr_enc_dec = input_values_enc_dec[0].cpu().numpy() + + # make sure audios are more or less equal + # the RMSE of two random gaussian noise vectors with ~N(0, 1) is around 1.0 + rmse = compute_rmse(arr, arr_enc_dec) + self.assertTrue(rmse < expected_rmse) + + def test_integration_48kHz(self): + expected_rmse = { + "3.0": 0.001, + "24.0": 0.0005, + } + expected_codesums = { + "3.0": [144259, 146765, 156435, 176871, 161971], + "24.0": [1568553, 1294948, 1306190, 1464747, 1663150], + } + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + model_id = "facebook/encodec_48khz" + + model = EncodecModel.from_pretrained(model_id).to(torch_device) + model = model.eval() + processor = AutoProcessor.from_pretrained(model_id) + + librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) + audio_sample = librispeech_dummy[-1]["audio"]["array"] + + # transform mono to stereo + audio_sample = np.array([audio_sample, audio_sample]) + + inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt").to( + torch_device + ) + + for bandwidth, expected_rmse in expected_rmse.items(): + with torch.no_grad(): + # use max bandwidth for best possible reconstruction + encoder_outputs = model.encode( + inputs["input_values"], inputs["padding_mask"], bandwidth=float(bandwidth), return_dict=False + ) + audio_code_sums = [a[0].sum().item() for a in encoder_outputs[0]] + + # make sure audio encoded codes are correct + self.assertListEqual(audio_code_sums, expected_codesums[bandwidth]) + audio_codes, scales = encoder_outputs + input_values_dec = model.decode(audio_codes, scales, inputs["padding_mask"])[0] + input_values_enc_dec = model( + inputs["input_values"], inputs["padding_mask"], bandwidth=float(bandwidth) + )[-1] + + # make sure forward and decode gives same result + torch.testing.assert_close(input_values_dec, input_values_enc_dec, rtol=1e-3, atol=1e-3) + + # make sure shape matches + self.assertTrue(inputs["input_values"].shape == input_values_enc_dec.shape) + + arr = inputs["input_values"][0].cpu().numpy() + arr_enc_dec = input_values_enc_dec[0].cpu().numpy() + + # make sure audios are more or less equal + # the RMSE of two random gaussian noise vectors with ~N(0, 1) is around 1.0 + rmse = compute_rmse(arr, arr_enc_dec) + self.assertTrue(rmse < expected_rmse) + + def test_batch_48kHz(self): + expected_rmse = { + "3.0": 0.001, + "24.0": 0.0005, + } + expected_codesums = { + "3.0": [ + [72410, 79137, 76694, 90854, 73023, 82980, 
72707, 54842], + [85561, 81870, 76953, 48967, 79315, 85442, 81479, 107241], + ], + "24.0": [ + [72410, 79137, 76694, 90854, 73023, 82980, 72707, 54842], + [85561, 81870, 76953, 48967, 79315, 85442, 81479, 107241], + ], + } + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + model_id = "facebook/encodec_48khz" + + model = EncodecModel.from_pretrained(model_id).to(torch_device) + processor = AutoProcessor.from_pretrained(model_id, chunk_length_s=1, overlap=0.01) + + librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) + + audio_samples = [ + np.array([audio_sample["array"], audio_sample["array"]]) + for audio_sample in librispeech_dummy[-2:]["audio"] + ] + + inputs = processor(raw_audio=audio_samples, sampling_rate=processor.sampling_rate, return_tensors="pt") + input_values = inputs["input_values"].to(torch_device) + for bandwidth, expected_rmse in expected_rmse.items(): + with torch.no_grad(): + # use max bandwidth for best possible reconstruction + encoder_outputs = model.encode(input_values, bandwidth=float(bandwidth), return_dict=False) + audio_code_sums_0 = [a[0][0].sum().item() for a in encoder_outputs[0]] + audio_code_sums_1 = [a[0][1].sum().item() for a in encoder_outputs[0]] + + # make sure audio encoded codes are correct + self.assertListEqual(audio_code_sums_0, expected_codesums[bandwidth][0]) + self.assertListEqual(audio_code_sums_1, expected_codesums[bandwidth][1]) + + audio_codes, scales = encoder_outputs + input_values_dec = model.decode(audio_codes, scales)[0] + input_values_enc_dec = model(input_values, bandwidth=float(bandwidth))[-1] + + # make sure forward and decode gives same result + torch.testing.assert_close(input_values_dec, input_values_enc_dec, rtol=1e-3, atol=1e-3) + + # make sure shape matches + self.assertTrue(input_values.shape == input_values_enc_dec.shape) + + arr = input_values[0].cpu().numpy() + arr_enc_dec = input_values_enc_dec[0].cpu().numpy() + + # make sure audios are more or less equal + # the RMSE of two random gaussian noise vectors with ~N(0, 1) is around 1.0 + rmse = compute_rmse(arr, arr_enc_dec) + self.assertTrue(rmse < expected_rmse) diff --git a/docs/transformers/tests/models/encoder_decoder/__init__.py b/docs/transformers/tests/models/encoder_decoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/encoder_decoder/test_modeling_encoder_decoder.py b/docs/transformers/tests/models/encoder_decoder/test_modeling_encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..18b7d0b61292ad1a380eb95d7b85cf44b402a60d --- /dev/null +++ b/docs/transformers/tests/models/encoder_decoder/test_modeling_encoder_decoder.py @@ -0,0 +1,1285 @@ +# Copyright 2020 HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
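+# Tests for EncoderDecoderModel: pairing encoders such as BERT, RoBERTa and BertGeneration with
+# causal-LM decoders (BERT, GPT-2, BART, ProphetNet) and exercising forward, labels/loss, generate,
+# save/load round-trips and SDPA dispatch for the composite model.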
+ + +import tempfile +import unittest + +from transformers import is_torch_available, logging +from transformers.testing_utils import ( + CaptureLogger, + Expectations, + require_deterministic_for_xpu, + require_torch, + require_torch_sdpa, + slow, + torch_device, +) + +from ...test_modeling_common import ids_tensor +from ..bart.test_modeling_bart import BartStandaloneDecoderModelTester +from ..bert.test_modeling_bert import BertModelTester +from ..bert_generation.test_modeling_bert_generation import BertGenerationEncoderTester +from ..gpt2.test_modeling_gpt2 import GPT2ModelTester +from ..prophetnet.test_modeling_prophetnet import ProphetNetStandaloneDecoderModelTester +from ..roberta.test_modeling_roberta import RobertaModelTester + + +if is_torch_available(): + import numpy as np + import torch + + from transformers import ( + AutoConfig, + AutoTokenizer, + BartForCausalLM, + BertGenerationDecoder, + BertGenerationEncoder, + BertLMHeadModel, + BertModel, + BertTokenizer, + EncoderDecoderConfig, + EncoderDecoderModel, + GPT2LMHeadModel, + ProphetNetForCausalLM, + RobertaForCausalLM, + RobertaModel, + ) + from transformers.modeling_outputs import BaseModelOutput + + +@require_torch +class EncoderDecoderMixin: + supports_sdpa = False + + def get_encoder_decoder_model(self, config, decoder_config): + raise NotImplementedError + + def prepare_config_and_inputs(self): + raise NotImplementedError + + def get_pretrained_model(self): + raise NotImplementedError + + def check_encoder_decoder_model_from_pretrained_configs( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs, + ): + encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) + self.assertTrue(encoder_decoder_config.decoder.is_decoder) + + enc_dec_model = EncoderDecoderModel(encoder_decoder_config) + enc_dec_model.to(torch_device) + enc_dec_model.eval() + + self.assertTrue(enc_dec_model.config.is_encoder_decoder) + + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def check_encoder_decoder_model( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs, + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + self.assertTrue(enc_dec_model.config.decoder.is_decoder) + self.assertTrue(enc_dec_model.config.decoder.add_cross_attention) + self.assertTrue(enc_dec_model.config.is_encoder_decoder) + enc_dec_model.to(torch_device) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + encoder_outputs = 
BaseModelOutput(last_hidden_state=encoder_hidden_states) + outputs_encoder_decoder = enc_dec_model( + encoder_outputs=encoder_outputs, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + # Test passing encoder_outputs as tuple. + encoder_outputs = (encoder_hidden_states,) + outputs_encoder_decoder = enc_dec_model( + encoder_outputs=encoder_outputs, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def check_encoder_decoder_model_from_pretrained_using_model_paths( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs, + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + with ( + tempfile.TemporaryDirectory() as encoder_tmp_dirname, + tempfile.TemporaryDirectory() as decoder_tmp_dirname, + ): + encoder_model.save_pretrained(encoder_tmp_dirname) + decoder_model.save_pretrained(decoder_tmp_dirname) + model_kwargs = {"encoder_hidden_dropout_prob": 0.0} + + # BartConfig has no hidden_dropout_prob. + if not hasattr(decoder_config, "hidden_dropout_prob"): + model_kwargs["decoder_activation_function"] = "gelu" + else: + model_kwargs["decoder_hidden_dropout_prob"] = 0.0 + + enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained( + encoder_tmp_dirname, decoder_tmp_dirname, **model_kwargs + ) + enc_dec_model.to(torch_device) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + return_dict=True, + ) + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def check_encoder_decoder_model_from_pretrained( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + return_dict, + **kwargs, + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} + enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs) + enc_dec_model.to(torch_device) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + return_dict=True, + ) + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def check_save_and_load( + self, + config, 
+ input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs, + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + enc_dec_model.to(torch_device) + enc_dec_model.eval() + with torch.no_grad(): + outputs = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + out_2 = outputs[0].cpu().numpy() + out_2[np.isnan(out_2)] = 0 + + with tempfile.TemporaryDirectory() as tmpdirname: + enc_dec_model.save_pretrained(tmpdirname) + enc_dec_model = EncoderDecoderModel.from_pretrained(tmpdirname) + enc_dec_model.to(torch_device) + + after_outputs = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + out_1 = after_outputs[0].cpu().numpy() + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def check_save_and_load_encoder_decoder_model( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs, + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + enc_dec_model.to(torch_device) + enc_dec_model.eval() + with torch.no_grad(): + outputs = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + out_2 = outputs[0].cpu().numpy() + out_2[np.isnan(out_2)] = 0 + + with ( + tempfile.TemporaryDirectory() as encoder_tmp_dirname, + tempfile.TemporaryDirectory() as decoder_tmp_dirname, + ): + enc_dec_model.encoder.save_pretrained(encoder_tmp_dirname) + enc_dec_model.decoder.save_pretrained(decoder_tmp_dirname) + enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained( + encoder_pretrained_model_name_or_path=encoder_tmp_dirname, + decoder_pretrained_model_name_or_path=decoder_tmp_dirname, + ) + enc_dec_model.to(torch_device) + + after_outputs = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + out_1 = after_outputs[0].cpu().numpy() + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def check_encoder_decoder_model_labels( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + labels, + **kwargs, + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + enc_dec_model.to(torch_device) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + labels=labels, + ) + + loss = outputs_encoder_decoder["loss"] + # check that backprop works + loss.backward() + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + 
outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def _check_output_with_attentions( + self, outputs_encoder_decoder, config, input_ids, decoder_config, decoder_input_ids + ): + encoder_attentions = outputs_encoder_decoder["encoder_attentions"] + self.assertEqual(len(encoder_attentions), config.num_hidden_layers) + + self.assertEqual( + encoder_attentions[0].shape[-3:], (config.num_attention_heads, input_ids.shape[-1], input_ids.shape[-1]) + ) + + decoder_attentions = outputs_encoder_decoder["decoder_attentions"] + num_decoder_layers = ( + decoder_config.num_decoder_layers + if hasattr(decoder_config, "num_decoder_layers") + else decoder_config.num_hidden_layers + ) + self.assertEqual(len(decoder_attentions), num_decoder_layers) + + self.assertEqual( + decoder_attentions[0].shape[-3:], + (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]), + ) + + cross_attentions = outputs_encoder_decoder["cross_attentions"] + self.assertEqual(len(cross_attentions), num_decoder_layers) + + cross_attention_input_seq_len = decoder_input_ids.shape[-1] * ( + 1 + (decoder_config.ngram if hasattr(decoder_config, "ngram") else 0) + ) + self.assertEqual( + cross_attentions[0].shape[-3:], + (decoder_config.num_attention_heads, cross_attention_input_seq_len, input_ids.shape[-1]), + ) + + def check_encoder_decoder_model_output_attentions( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + labels, + **kwargs, + ): + # make the decoder inputs a different shape from the encoder inputs to harden the test + decoder_input_ids = decoder_input_ids[:, :-1] + decoder_attention_mask = decoder_attention_mask[:, :-1] + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + enc_dec_model.to(torch_device) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + output_attentions=True, + ) + self._check_output_with_attentions( + outputs_encoder_decoder, config, input_ids, decoder_config, decoder_input_ids + ) + + def check_encoder_decoder_model_output_attentions_from_config( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + labels, + **kwargs, + ): + # Similar to `check_encoder_decoder_model_output_attentions`, but with `output_attentions` triggered from the + # config file. Contrarily to most models, changing the model's config won't work -- the defaults are loaded + # from the inner models' configurations. 
+ + decoder_input_ids = decoder_input_ids[:, :-1] + decoder_attention_mask = decoder_attention_mask[:, :-1] + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + enc_dec_model.config.output_attentions = True # model config -> won't work + enc_dec_model.to(torch_device) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + self.assertTrue( + all( + key not in outputs_encoder_decoder + for key in ["encoder_attentions", "decoder_attentions", "cross_attentions"] + ) + ) + + config.output_attentions = True # inner model config -> will work + decoder_config.output_attentions = True + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + enc_dec_model.to(torch_device) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + self._check_output_with_attentions( + outputs_encoder_decoder, config, input_ids, decoder_config, decoder_input_ids + ) + + def check_encoder_decoder_model_generate(self, input_ids, config, decoder_config, **kwargs): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + + # Generate until max length + if hasattr(enc_dec_model.config, "eos_token_id"): + enc_dec_model.config.eos_token_id = None + if hasattr(enc_dec_model.config, "decoder") and hasattr(enc_dec_model.config.decoder, "eos_token_id"): + enc_dec_model.config.decoder.eos_token_id = None + if hasattr(enc_dec_model.generation_config, "eos_token_id"): + enc_dec_model.generation_config.eos_token_id = None + enc_dec_model.to(torch_device) + + # Bert does not have a bos token id, so use pad_token_id instead + generated_output = enc_dec_model.generate( + input_ids, + decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id, + max_length=decoder_config.max_length, + ) + self.assertEqual(generated_output.shape, (input_ids.shape[0],) + (decoder_config.max_length,)) + + def create_and_check_encoder_decoder_shared_weights( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + labels, + **kwargs, + ): + torch.manual_seed(0) + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + model.to(torch_device) + model.eval() + # load state dict copies weights but does not tie them + decoder_state_dict = model.decoder._modules[model.decoder.base_model_prefix].state_dict() + model.encoder.load_state_dict(decoder_state_dict, strict=False) + + torch.manual_seed(0) + tied_encoder_model, tied_decoder_model = self.get_encoder_decoder_model(config, decoder_config) + config = EncoderDecoderConfig.from_encoder_decoder_configs( + tied_encoder_model.config, tied_decoder_model.config, tie_encoder_decoder=True + ) + tied_model = EncoderDecoderModel(encoder=tied_encoder_model, decoder=tied_decoder_model, config=config) + tied_model.to(torch_device) + tied_model.eval() + + model_result = model( + input_ids=input_ids, + 
decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + tied_model_result = tied_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + # check that models has less parameters + self.assertLess(sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters())) + random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() + + # check that outputs are equal + self.assertTrue( + torch.allclose( + model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4 + ) + ) + + # check that outputs after saving and loading are equal + with tempfile.TemporaryDirectory() as tmpdirname: + tied_model.save_pretrained(tmpdirname) + tied_model = EncoderDecoderModel.from_pretrained(tmpdirname) + tied_model.to(torch_device) + tied_model.eval() + + # check that models has less parameters + self.assertLess( + sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) + ) + random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() + + tied_model_result = tied_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + # check that outputs are equal + self.assertTrue( + torch.allclose( + model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4 + ) + ) + + def test_encoder_decoder_model(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model(**input_ids_dict) + + def test_encoder_decoder_model_from_pretrained_configs(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_from_pretrained_configs(**input_ids_dict) + + def test_encoder_decoder_model_from_pretrained(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=False) + + def test_encoder_decoder_model_from_pretrained_return_dict(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=True) + + def test_encoder_decoder_model_from_pretrained_using_model_paths(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_from_pretrained_using_model_paths(**input_ids_dict, return_dict=False) + + def test_save_and_load_from_pretrained(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_save_and_load(**input_ids_dict) + + def test_save_and_load_from_encoder_decoder_pretrained(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_save_and_load_encoder_decoder_model(**input_ids_dict) + + def test_encoder_decoder_model_labels(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_labels(**input_ids_dict) + + def test_encoder_decoder_model_output_attentions(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_output_attentions(**input_ids_dict) + + def test_encoder_decoder_model_output_attentions_from_config(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_output_attentions_from_config(**input_ids_dict) + + def test_encoder_decoder_model_generate(self): + input_ids_dict = self.prepare_config_and_inputs() + 
self.check_encoder_decoder_model_generate(**input_ids_dict) + + def test_encoder_decoder_model_shared_weights(self): + input_ids_dict = self.prepare_config_and_inputs() + self.create_and_check_encoder_decoder_shared_weights(**input_ids_dict) + + def test_training_gradient_checkpointing(self): + inputs_dict = self.prepare_config_and_inputs() + encoder_model, decoder_model = self.get_encoder_decoder_model( + inputs_dict["config"], inputs_dict["decoder_config"] + ) + + model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + + model.config.decoder_start_token_id = 0 + model.config.pad_token_id = 0 + + model_inputs = { + "input_ids": inputs_dict["input_ids"], + "attention_mask": inputs_dict["attention_mask"], + "labels": inputs_dict["labels"], + "decoder_input_ids": inputs_dict["decoder_input_ids"], + } + model_inputs = {k: v.to(torch_device) for k, v in model_inputs.items()} + + loss = model(**model_inputs).loss + loss.backward() + + @slow + @require_deterministic_for_xpu + def test_real_model_save_load_from_pretrained(self): + model_2 = self.get_pretrained_model() + model_2.to(torch_device) + input_ids = ids_tensor([13, 5], model_2.config.encoder.vocab_size) + decoder_input_ids = ids_tensor([13, 1], model_2.config.encoder.vocab_size) + attention_mask = ids_tensor([13, 5], vocab_size=2) + with torch.no_grad(): + outputs = model_2( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + ) + out_2 = outputs[0].cpu().numpy() + out_2[np.isnan(out_2)] = 0 + + with tempfile.TemporaryDirectory() as tmp_dirname: + model_2.save_pretrained(tmp_dirname) + model_1 = EncoderDecoderModel.from_pretrained(tmp_dirname) + model_1.to(torch_device) + + after_outputs = model_1( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + ) + out_1 = after_outputs[0].cpu().numpy() + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + if not self.supports_sdpa: + self.skipTest("SDPA is not supported") + + inputs_dict = self.prepare_config_and_inputs() + encoder_config, decoder_config = inputs_dict["config"], inputs_dict["decoder_config"] + config = EncoderDecoderConfig.from_encoder_decoder_configs( + encoder_config=encoder_config, decoder_config=decoder_config + ) + model = EncoderDecoderModel(config=config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = EncoderDecoderModel.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + # see https://github.com/huggingface/transformers/pull/32238 + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + encoder_attn = "sdpa" if model.encoder._supports_sdpa else "eager" + decoder_attn = "sdpa" if model.decoder._supports_sdpa else "eager" + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model_sdpa.encoder.config._attn_implementation == encoder_attn) + self.assertTrue(model_sdpa.decoder.config._attn_implementation == decoder_attn) + + # Also test that nothing break if we request SDPA explicitly, when both sub-parts support it. + # If the model supports sdpa (i.e. 
all of sub-models supports it) we'll dispatch safely + # Otherwise we should raise error that SDPA is not supported, as some of the sub-models doesn't support + if encoder_attn == "sdpa" and decoder_attn == "sdpa": + model_sdpa_explicit = EncoderDecoderModel.from_pretrained(tmpdirname, attn_implementation="sdpa") + model_sdpa_explicit = model_sdpa_explicit.eval().to(torch_device) + + self.assertTrue(model_sdpa_explicit.config._attn_implementation == "sdpa") + else: + with self.assertRaises(ValueError): + model_sdpa_explicit = EncoderDecoderModel.from_pretrained(tmpdirname, attn_implementation="sdpa") + + model_eager = EncoderDecoderModel.from_pretrained( + tmpdirname, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.encoder.config._attn_implementation == "eager") + self.assertTrue(model_eager.decoder.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + +@require_torch +class BertEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): + def get_pretrained_model(self): + return EncoderDecoderModel.from_encoder_decoder_pretrained( + "google-bert/bert-base-cased", "google-bert/bert-base-cased" + ) + + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = BertModel(config) + decoder_model = BertLMHeadModel(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester = BertModelTester(self) + encoder_config_and_inputs = model_tester.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_token_type_ids, + decoder_input_mask, + decoder_sequence_labels, + decoder_token_labels, + decoder_choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_token_type_ids": decoder_token_type_ids, + "decoder_attention_mask": decoder_input_mask, + "decoder_sequence_labels": decoder_sequence_labels, + "decoder_token_labels": decoder_token_labels, + "decoder_choice_labels": decoder_choice_labels, + "encoder_hidden_states": encoder_hidden_states, + "labels": decoder_token_labels, + } + + def test_relative_position_embeds(self): + config_and_inputs = self.prepare_config_and_inputs() + + encoder_config = config_and_inputs["config"] + decoder_config = config_and_inputs["decoder_config"] + + encoder_config.position_embedding_type = "relative_key_query" + decoder_config.position_embedding_type = "relative_key_query" + + config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config) + model = EncoderDecoderModel(config).eval().to(torch_device) + + logits = model( + input_ids=config_and_inputs["input_ids"], decoder_input_ids=config_and_inputs["decoder_input_ids"] + ).logits + + 
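+ # smoke test: the forward pass with relative_key_query position embeddings should run and return logits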
self.assertTrue(logits.shape, (13, 7)) + + @slow + def test_bert2bert_summarization(self): + model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16") + model.to(torch_device) + tokenizer = BertTokenizer.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16") + + ARTICLE_SIGMA = """(CNN)Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members singing a racist chant. SAE's national chapter suspended the students, but University of Oklahoma President David Boren took it a step further, saying the university's affiliation with the fraternity is permanently done. The news is shocking, but it's not the first time SAE has faced controversy. SAE was founded March 9, 1856, at the University of Alabama, five years before the American Civil War, according to the fraternity website. When the war began, the group had fewer than 400 members, of which "369 went to war for the Confederate States and seven for the Union Army," the website says. The fraternity now boasts more than 200,000 living alumni, along with about 15,000 undergraduates populating 219 chapters and 20 "colonies" seeking full membership at universities. SAE has had to work hard to change recently after a string of member deaths, many blamed on the hazing of new recruits, SAE national President Bradley Cohen wrote in a message on the fraternity's website. The fraternity's website lists more than 130 chapters cited or suspended for "health and safety incidents" since 2010. At least 30 of the incidents involved hazing, and dozens more involved alcohol. However, the list is missing numerous incidents from recent months. Among them, according to various media outlets: Yale University banned the SAEs from campus activities last month after members allegedly tried to interfere with a sexual misconduct investigation connected to an initiation rite. Stanford University in December suspended SAE housing privileges after finding sorority members attending a fraternity function were subjected to graphic sexual content. And Johns Hopkins University in November suspended the fraternity for underage drinking. "The media has labeled us as the 'nation's deadliest fraternity,' " Cohen said. In 2011, for example, a student died while being coerced into excessive alcohol consumption, according to a lawsuit. SAE's previous insurer dumped the fraternity. "As a result, we are paying Lloyd's of London the highest insurance rates in the Greek-letter world," Cohen said. Universities have turned down SAE's attempts to open new chapters, and the fraternity had to close 12 in 18 months over hazing incidents.""" + + ARTICLE_AMERICA = """(CNN) -- The 2013 America's Cup will be faster than ever after organizers announced that wingsail catamarans will be the vessels of choice. The race has historically been between yachts with a single hull, however the 34th edition of the contest will be between multi-hull vessels with wings rather than traditional sails. This means the boats will travel faster through the water, with top speeds in excess of 30 knots, almost three times as fast as in the past. The Golden Gate Yacht Club, hosts of the 2013 race and holders of the cup, have also announced a new, shorter race format for the competition. In an attempt to boost interest in one of sailing's showpiece events an annual World Series will also take place, starting in 2011, resulting a world champion team being crowned. In addition, a youth America's Cup will also be introduced, set to begin in 2012. 
In a statement on the International Sailing Federation (ISAF) website, the CEO of 2010's winning syndicate BMW ORACLE Racing Russell Coutts explained the reasons behind the changes. "We believe this new format and new boat will put the America's Cup back at the pinnacle of our sport," said Coutts. "These changes will give equal opportunity to competitors and long-term economic stability to all teams and all commercial partners. We promised fairness and innovation and this is what we've delivered." The statement also explained how, in addition to generating interest in the contest, the new annual America's Cup World Series will provide increased commercial revenue for the teams and their sponsors. The venue for the 2013 contest is not due to be announced until the end of the year, with San Francisco, Valencia and a location near Rome believed to be under consideration. Vincenzo Onorato, President of the 2013 challengers Mascalzone Latino, supported the changes: "I think that we need to acknowledge that the Defender has kept its word. The America's Cup is going to have fair rules and a truly independent management of the racing.""" + + EXPECTED_SUMMARY_SIGMA = """sae was founded in 1856, five years before the civil war. the fraternity has had to work hard to change recently. the university of oklahoma president says the university's affiliation with the fraternity is permanently done. the sae has had a string of members in recent months.""" + + EXPECTED_SUMMARY_AMERICA = """the 2013 america's cup will be faster than ever. the 34th edition of the competition will be held in 2011. the 2013 race will be between multi - hull vessels with wings rather than traditional sails. the new america'' cup will provide increased commercial revenue. the event will also be expanded to a youth america'cup.""" + + input_dict = tokenizer( + [ARTICLE_SIGMA, ARTICLE_AMERICA], + padding="max_length", + pad_to_max_length=True, + max_length=512, + return_tensors="pt", + ) + output_ids = model.generate( + input_dict["input_ids"].to(torch_device), attention_mask=input_dict["attention_mask"].to(torch_device) + ) + summary = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + self.assertEqual(summary, [EXPECTED_SUMMARY_SIGMA, EXPECTED_SUMMARY_AMERICA]) + + def test_bert2bert_default_decoder_attention_mask(self): + torch.manual_seed(0) + test_dict = self.prepare_config_and_inputs() + encoder_config, decoder_config = test_dict["config"], test_dict["decoder_config"] + + encoder_config.pad_token_id = 5 + encoder_config.decoder_start_token_id = 2 + decoder_config.pad_token_id = 5 + decoder_config.decoder_start_token_id = 2 + + config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config) + config.pad_token_id = 5 + config.decoder_start_token_id = 2 + + encoder_model, decoder_model = self.get_encoder_decoder_model(encoder_config, decoder_config) + model = EncoderDecoderModel(config=config, encoder=encoder_model, decoder=decoder_model) + + input_ids = torch.tensor( + [ + [10, 55, 89, 11, 57, 32, 36, 78, 46, 28, 5, 5, 5], + [10, 21, 97, 71, 63, 19, 12, 57, 5, 5, 5, 5, 5], + ] + ) + attention_mask = input_ids.new_tensor(input_ids != 5) + labels = torch.tensor( + [ + [33, 23, 91, 12, 19, 96, 5, 5], + [87, 85, 13, 31, 5, 5, 5, 5], + ] + ) + + logger = logging.get_logger("transformers.modeling_utils") + logger.warning_once.cache_clear() + + with CaptureLogger(logger) as cl: + torch.manual_seed(0) + output = model(input_ids, attention_mask, labels=labels) + + # Assert that the warning does 
not show up since a default decoder_attention_mask should have been created. + self.assertNotIn("We strongly recommend passing in an `attention_mask`", cl.out) + + # Create a new attention mask that ignores padding, and test that the loss differs for this new attention mask + # and the default attention mask. + attention_mask_ignoring_padding = torch.ones(labels.shape, dtype=torch.long) + torch.manual_seed(0) + ignore_pad_tokens_output = model( + input_ids, attention_mask, labels=labels, decoder_attention_mask=attention_mask_ignoring_padding + ) + self.assertNotAlmostEqual(output.loss.item(), ignore_pad_tokens_output.loss.item()) + + +@require_torch +class BertGenerationEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): + def get_pretrained_model(self): + return EncoderDecoderModel.from_encoder_decoder_pretrained( + "google/bert_for_seq_generation_L-24_bbc_encoder", "google/bert_for_seq_generation_L-24_bbc_encoder" + ) + + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = BertGenerationEncoder(config) + decoder_model = BertGenerationDecoder(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester = BertGenerationEncoderTester(self) + encoder_config_and_inputs = model_tester.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + input_mask, + token_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_input_mask, + decoder_token_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_input_mask, + "decoder_token_labels": decoder_token_labels, + "encoder_hidden_states": encoder_hidden_states, + "labels": decoder_token_labels, + } + + @slow + @require_deterministic_for_xpu + def test_roberta2roberta_summarization(self): + model = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_bbc") + model.to(torch_device) + tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_bbc") + + ARTICLE_PS3 = """The problem is affecting people using the older versions of the PlayStation 3, called the "Fat" model.The problem isn't affecting the newer PS3 Slim systems that have been on sale since September last year.Sony have also said they are aiming to have the problem fixed shortly but is advising some users to avoid using their console for the time being."We hope to resolve this problem within the next 24 hours," a statement reads. 
"In the meantime, if you have a model other than the new slim PS3, we advise that you do not use your PS3 system, as doing so may result in errors in some functionality, such as recording obtained trophies, and not being able to restore certain data."We believe we have identified that this problem is being caused by a bug in the clock functionality incorporated in the system."The PlayStation Network is used by millions of people around the world.It allows users to play their friends at games like Fifa over the internet and also do things like download software or visit online stores.""" + + ARTICLE_TOSHIBA = """An independent panel appointed by Toshiba found institutional accounting irregularities, the firm said in a statement to investors. Toshiba said it "takes the situation it has caused very seriously" and that it "deeply apologised" to shareholders. The overstatement was roughly triple an initial Toshiba estimate. The probe could lead to a restatement of earnings, a board overhaul and potential action by regulators. "Within Toshiba, there was a corporate culture in which one could not go against the wishes of superiors," the report said. "Therefore, when top management presented 'challenges', division presidents, line managers and employees below them continually carried out inappropriate accounting practices to meet targets in line with the wishes of their superiors." The improper accounting practices stretched back to 2008.""" + + # fmt: off + EXPECTED_SUMMARIES_PS3 = Expectations( + { + ("xpu", 3): """Sony has said that a bug in its PlayStation 3 console is preventing them from using the machine as a computer .""", + ("cuda", 7): """Sony has said that a bug in its PlayStation 3 console is preventing them from using the machine as a computer.""", + } + ) # fmt: on + EXPECTED_SUMMARY_PS3 = EXPECTED_SUMMARIES_PS3.get_expectation() + + EXPECTED_SUMMARIES_TOSHIBA = Expectations( + { + ( + "xpu", + 3, + ): """Japanese electronics giant Toshiba overstated its annual earnings by more than a third last year , according to a report .""", + ( + "cuda", + 7, + ): """Japanese electronics giant Toshiba overstated its annual earnings by more than a third last year, according to a report.""", + } + ) + EXPECTED_SUMMARY_TOSHIBA = EXPECTED_SUMMARIES_TOSHIBA.get_expectation() + + input_dict = tokenizer( + [ARTICLE_PS3, ARTICLE_TOSHIBA], max_length=512, padding="max_length", return_tensors="pt" + ) + output_ids = model.generate( + input_dict["input_ids"].to(torch_device), attention_mask=input_dict["attention_mask"].to(torch_device) + ) + summary = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + self.assertEqual(summary, [EXPECTED_SUMMARY_PS3, EXPECTED_SUMMARY_TOSHIBA]) + + +@require_torch +class RoBertaEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = RobertaModel(config) + decoder_model = RobertaForCausalLM(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester = RobertaModelTester(self) + encoder_config_and_inputs = model_tester.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_token_type_ids, + decoder_input_mask, + decoder_sequence_labels, + decoder_token_labels, + 
decoder_choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_token_type_ids": decoder_token_type_ids, + "decoder_attention_mask": decoder_input_mask, + "decoder_sequence_labels": decoder_sequence_labels, + "decoder_token_labels": decoder_token_labels, + "decoder_choice_labels": decoder_choice_labels, + "encoder_hidden_states": encoder_hidden_states, + "labels": decoder_token_labels, + } + + def get_pretrained_model(self): + return EncoderDecoderModel.from_encoder_decoder_pretrained( + "FacebookAI/roberta-base", "FacebookAI/roberta-base" + ) + + +@require_torch +class GPT2EncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): + supports_sdpa = True + + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = BertModel(config) + decoder_model = GPT2LMHeadModel(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester_encoder = BertModelTester(self, batch_size=13) + model_tester_decoder = GPT2ModelTester(self, batch_size=13) + encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_input_mask, + decoder_head_mask, + decoder_token_type_ids, + decoder_sequence_labels, + decoder_token_labels, + decoder_choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + # disable cache for now + decoder_config.use_cache = False + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_token_type_ids": decoder_token_type_ids, + "decoder_attention_mask": decoder_input_mask, + "decoder_sequence_labels": decoder_sequence_labels, + "decoder_token_labels": decoder_token_labels, + "decoder_choice_labels": decoder_choice_labels, + "encoder_hidden_states": encoder_hidden_states, + "labels": decoder_token_labels, + } + + def get_pretrained_model(self): + return EncoderDecoderModel.from_encoder_decoder_pretrained( + "google-bert/bert-base-cased", "openai-community/gpt2" + ) + + @unittest.skip + def test_encoder_decoder_model_shared_weights(self): + pass + + @slow + @require_deterministic_for_xpu + def test_bert2gpt2_summarization(self): + model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2gpt2-cnn_dailymail-fp16") + + model.to(torch_device) + tokenizer_in = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") + tokenizer_out = AutoTokenizer.from_pretrained("openai-community/gpt2") + + ARTICLE_STUDENTS = """(CNN)Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members singing a racist chant. SAE's national chapter suspended the students, but University of Oklahoma President David Boren took it a step further, saying the university's affiliation with the fraternity is permanently done. 
The news is shocking, but it's not the first time SAE has faced controversy. SAE was founded March 9, 1856, at the University of Alabama, five years before the American Civil War, according to the fraternity website. When the war began, the group had fewer than 400 members, of which "369 went to war for the Confederate States and seven for the Union Army," the website says. The fraternity now boasts more than 200,000 living alumni, along with about 15,000 undergraduates populating 219 chapters and 20 "colonies" seeking full membership at universities. SAE has had to work hard to change recently after a string of member deaths, many blamed on the hazing of new recruits, SAE national President Bradley Cohen wrote in a message on the fraternity's website. The fraternity's website lists more than 130 chapters cited or suspended for "health and safety incidents" since 2010. At least 30 of the incidents involved hazing, and dozens more involved alcohol. However, the list is missing numerous incidents from recent months. Among them, according to various media outlets: Yale University banned the SAEs from campus activities last month after members allegedly tried to interfere with a sexual misconduct investigation connected to an initiation rite. Stanford University in December suspended SAE housing privileges after finding sorority members attending a fraternity function were subjected to graphic sexual content. And Johns Hopkins University in November suspended the fraternity for underage drinking. "The media has labeled us as the 'nation's deadliest fraternity,' " Cohen said. In 2011, for example, a student died while being coerced into excessive alcohol consumption, according to a lawsuit. SAE's previous insurer dumped the fraternity. "As a result, we are paying Lloyd's of London the highest insurance rates in the Greek-letter world," Cohen said. 
Universities have turned down SAE's attempts to open new chapters, and the fraternity had to close 12 in 18 months over hazing incidents.""" + + EXPECTED_SUMMARIES_STUDENTS = Expectations( + { + ( + "xpu", + 3, + ): """SAS Alpha Epsilon suspended the students, but university president says it's permanent .\nThe fraternity has had to deal with a string of student deaths since 2010 .\nSAS has more than 200,000 members, many of whom are students .\nA student died while being forced into excessive alcohol consumption .""", + ( + "cuda", + 7, + ): """SAS Alpha Epsilon suspended the students, but university president says it's permanent.\nThe fraternity has had to deal with a string of student deaths since 2010.\nSAS has more than 200,000 members, many of whom are students.\nA student died while being forced into excessive alcohol consumption.""", + } + ) + EXPECTED_SUMMARY_STUDENTS = EXPECTED_SUMMARIES_STUDENTS.get_expectation() + + input_dict = tokenizer_in(ARTICLE_STUDENTS, return_tensors="pt") + output_ids = model.generate(input_dict["input_ids"].to(torch_device)) + summary = tokenizer_out.batch_decode(output_ids, skip_special_tokens=True) + + self.assertEqual(summary, [EXPECTED_SUMMARY_STUDENTS]) + + +@require_torch +class ProphetNetEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = BertModel(config) + decoder_model = ProphetNetForCausalLM(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester_encoder = BertModelTester(self, batch_size=13) + model_tester_decoder = ProphetNetStandaloneDecoderModelTester( + self, batch_size=13, hidden_size=32, max_position_embeddings=512 + ) + encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + lm_labels, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + # disable cache for now + decoder_config.use_cache = False + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "encoder_hidden_states": encoder_hidden_states, + "labels": lm_labels, + } + + def get_pretrained_model(self): + return EncoderDecoderModel.from_encoder_decoder_pretrained( + "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased" + ) + + @unittest.skip + def test_encoder_decoder_model_shared_weights(self): + pass + + +@require_torch +class BartEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = BertModel(config) + decoder_model = BartForCausalLM(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester_encoder = BertModelTester(self, batch_size=13) + model_tester_decoder = BartStandaloneDecoderModelTester( + self, batch_size=13, d_model=32, max_position_embeddings=512 + ) + encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs() + 
decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + lm_labels, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + # disable cache for now + decoder_config.use_cache = False + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "encoder_hidden_states": encoder_hidden_states, + "labels": lm_labels, + } + + def get_pretrained_model(self): + return EncoderDecoderModel.from_encoder_decoder_pretrained( + "google-bert/bert-large-uncased", "facebook/bart-large" + ) + + @unittest.skip + def test_encoder_decoder_model_shared_weights(self): + pass + + +@require_torch +class EncoderDecoderModelTest(unittest.TestCase): + def get_from_encoderdecoder_pretrained_model(self): + return EncoderDecoderModel.from_encoder_decoder_pretrained( + "google-bert/bert-base-uncased", "google-bert/bert-base-uncased" + ) + + def get_decoder_config(self): + config = AutoConfig.from_pretrained("google-bert/bert-base-uncased") + config.is_decoder = True + config.add_cross_attention = True + return config + + def get_encoderdecoder_model(self): + return EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16") + + def get_encoder_decoder_models(self): + encoder_model = BertModel.from_pretrained("google-bert/bert-base-uncased") + decoder_model = BertLMHeadModel.from_pretrained( + "google-bert/bert-base-uncased", config=self.get_decoder_config() + ) + return {"encoder": encoder_model, "decoder": decoder_model} + + def _check_configuration_tie(self, model): + assert id(model.decoder.config) == id(model.config.decoder) + assert id(model.encoder.config) == id(model.config.encoder) + + @slow + def test_configuration_tie(self): + model = self.get_from_encoderdecoder_pretrained_model() + self._check_configuration_tie(model) + + model = EncoderDecoderModel(**self.get_encoder_decoder_models()) + self._check_configuration_tie(model) + + model = self.get_encoderdecoder_model() + self._check_configuration_tie(model) diff --git a/docs/transformers/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py b/docs/transformers/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..b17f9ed37bc6ce6467408ec1429bbf8682c6b58b --- /dev/null +++ b/docs/transformers/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py @@ -0,0 +1,498 @@ +# Copyright 2020 HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import tempfile +import unittest + +import numpy as np + +from transformers import is_flax_available +from transformers.testing_utils import require_flax, slow + +from ...test_modeling_flax_common import ids_tensor +from ..bart.test_modeling_flax_bart import FlaxBartStandaloneDecoderModelTester +from ..bert.test_modeling_flax_bert import FlaxBertModelTester +from ..gpt2.test_modeling_flax_gpt2 import FlaxGPT2ModelTester + + +if is_flax_available(): + from transformers import ( + AutoTokenizer, + EncoderDecoderConfig, + FlaxBartForCausalLM, + FlaxBertForCausalLM, + FlaxBertModel, + FlaxEncoderDecoderModel, + FlaxGPT2LMHeadModel, + ) + + +@require_flax +class FlaxEncoderDecoderMixin: + def get_encoder_decoder_model(self, config, decoder_config): + raise NotImplementedError + + def prepare_config_and_inputs(self): + raise NotImplementedError + + def get_pretrained_model(self): + raise NotImplementedError + + def check_encoder_decoder_model_from_pretrained_configs( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs, + ): + encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) + self.assertTrue(encoder_decoder_config.decoder.is_decoder) + + enc_dec_model = FlaxEncoderDecoderModel(encoder_decoder_config) + + self.assertTrue(enc_dec_model.config.is_encoder_decoder) + + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def check_encoder_decoder_model_from_pretrained( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + return_dict, + **kwargs, + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} + enc_dec_model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + return_dict=True, + ) + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def check_save_and_load( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs, + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model} + enc_dec_model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs) + + outputs = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + out_2 = np.array(outputs[0]) + out_2[np.isnan(out_2)] = 0 + + with tempfile.TemporaryDirectory() as 
tmpdirname: + enc_dec_model.save_pretrained(tmpdirname) + FlaxEncoderDecoderModel.from_pretrained(tmpdirname) + + after_outputs = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + out_1 = np.array(after_outputs[0]) + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def check_encoder_decoder_model_from_encoder_decoder_pretrained( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs, + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + # assert that model attributes match those of configs + self.assertEqual(config.use_cache, encoder_model.config.use_cache) + self.assertEqual(decoder_config.use_cache, decoder_model.config.use_cache) + + with tempfile.TemporaryDirectory() as enc_tmpdir: + with tempfile.TemporaryDirectory() as dec_tmpdir: + encoder_model.save_pretrained(enc_tmpdir) + decoder_model.save_pretrained(dec_tmpdir) + # load a model from pretrained encoder and decoder checkpoints, setting one encoder and one decoder kwarg opposite to that specified in their respective configs + enc_dec_model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained( + encoder_pretrained_model_name_or_path=enc_tmpdir, + decoder_pretrained_model_name_or_path=dec_tmpdir, + encoder_use_cache=not config.use_cache, + decoder_use_cache=not decoder_config.use_cache, + ) + + # assert that setting encoder and decoder kwargs opposite to those in the configs has correctly been applied + self.assertNotEqual(config.use_cache, enc_dec_model.config.encoder.use_cache) + self.assertNotEqual(decoder_config.use_cache, enc_dec_model.config.decoder.use_cache) + + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_hidden_states=True, + return_dict=True, + ) + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + + def check_encoder_decoder_model_output_attentions( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs, + ): + # make the decoder inputs a different shape from the encoder inputs to harden the test + decoder_input_ids = decoder_input_ids[:, :-1] + decoder_attention_mask = decoder_attention_mask[:, :-1] + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model} + enc_dec_model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + output_attentions=True, + ) + + encoder_attentions = outputs_encoder_decoder["encoder_attentions"] + self.assertEqual(len(encoder_attentions), config.num_hidden_layers) + + self.assertEqual( + encoder_attentions[0].shape[-3:], (config.num_attention_heads, input_ids.shape[-1], input_ids.shape[-1]) + ) + + decoder_attentions = outputs_encoder_decoder["decoder_attentions"] + num_decoder_layers = ( + decoder_config.num_decoder_layers + if hasattr(decoder_config, 
"num_decoder_layers") + else decoder_config.num_hidden_layers + ) + self.assertEqual(len(decoder_attentions), num_decoder_layers) + + self.assertEqual( + decoder_attentions[0].shape[-3:], + (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]), + ) + + cross_attentions = outputs_encoder_decoder["cross_attentions"] + self.assertEqual(len(cross_attentions), num_decoder_layers) + + cross_attention_input_seq_len = decoder_input_ids.shape[-1] * ( + 1 + (decoder_config.ngram if hasattr(decoder_config, "ngram") else 0) + ) + self.assertEqual( + cross_attentions[0].shape[-3:], + (decoder_config.num_attention_heads, cross_attention_input_seq_len, input_ids.shape[-1]), + ) + + def check_encoder_decoder_model_generate(self, input_ids, config, decoder_config, **kwargs): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model} + enc_dec_model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs) + + pad_token_id = enc_dec_model.config.decoder.pad_token_id + eos_token_id = enc_dec_model.config.decoder.eos_token_id + decoder_start_token_id = enc_dec_model.config.decoder.decoder_start_token_id + + # Copied from generation.utils (GPT2 doesn't have `pad_token_id`) + if pad_token_id is None and eos_token_id is not None: + pad_token_id = eos_token_id + if decoder_start_token_id is None: + decoder_start_token_id = enc_dec_model.config.decoder.bos_token_id + + # Bert does not have a bos token id, so use pad_token_id instead + # Copied from `test_modeling_encoder_decoder.py` + if decoder_start_token_id is None: + decoder_start_token_id = pad_token_id + + generated_output = enc_dec_model.generate( + input_ids, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + decoder_start_token_id=decoder_start_token_id, + ) + generated_sequences = generated_output.sequences + self.assertEqual(generated_sequences.shape, (input_ids.shape[0],) + (decoder_config.max_length,)) + + def test_encoder_decoder_model_from_pretrained_configs(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_from_pretrained_configs(**input_ids_dict) + + def test_encoder_decoder_model_from_pretrained(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=False) + + def test_encoder_decoder_model_from_pretrained_return_dict(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=True) + + def test_save_and_load_from_pretrained(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_save_and_load(**input_ids_dict) + + def test_encoder_decoder_model_from_encoder_decoder_pretrained(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_from_encoder_decoder_pretrained(**input_ids_dict) + + def test_encoder_decoder_model_output_attentions(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_output_attentions(**input_ids_dict) + + def test_encoder_decoder_model_generate(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_generate(**input_ids_dict) + + def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): + diff = np.abs(a - b).max() + self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).") + + 
@slow + def test_real_model_save_load_from_pretrained(self): + model_2 = self.get_pretrained_model() + input_ids = ids_tensor([13, 5], model_2.config.encoder.vocab_size) + decoder_input_ids = ids_tensor([13, 1], model_2.config.encoder.vocab_size) + attention_mask = ids_tensor([13, 5], vocab_size=2) + + outputs = model_2( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + ) + out_2 = np.array(outputs[0]) + out_2[np.isnan(out_2)] = 0 + + with tempfile.TemporaryDirectory() as tmp_dirname: + model_2.save_pretrained(tmp_dirname) + model_1 = FlaxEncoderDecoderModel.from_pretrained(tmp_dirname) + + after_outputs = model_1( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + ) + out_1 = np.array(after_outputs[0]) + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + +@require_flax +class FlaxGPT2EncoderDecoderModelTest(FlaxEncoderDecoderMixin, unittest.TestCase): + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = FlaxBertModel(config) + decoder_model = FlaxGPT2LMHeadModel(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester_encoder = FlaxBertModelTester(self, batch_size=13) + model_tester_decoder = FlaxGPT2ModelTester(self, batch_size=13) + encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder() + (config, input_ids, token_type_ids, attention_mask) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + return { + "config": config, + "input_ids": input_ids, + "attention_mask": attention_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "encoder_hidden_states": encoder_hidden_states, + } + + def get_pretrained_model(self): + return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained( + "google-bert/bert-base-cased", "openai-community/gpt2" + ) + + @slow + def test_bert2gpt2_summarization(self): + tokenizer_in = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") + tokenizer_out = AutoTokenizer.from_pretrained("openai-community/gpt2") + + model = FlaxEncoderDecoderModel.from_pretrained( + "patrickvonplaten/bert2gpt2-cnn_dailymail-fp16", pad_token_id=tokenizer_out.eos_token_id + ) + + ARTICLE_STUDENTS = """(CNN)Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members singing a racist chant. SAE's national chapter suspended the students, but University of Oklahoma President David Boren took it a step further, saying the university's affiliation with the fraternity is permanently done. The news is shocking, but it's not the first time SAE has faced controversy. SAE was founded March 9, 1856, at the University of Alabama, five years before the American Civil War, according to the fraternity website. When the war began, the group had fewer than 400 members, of which "369 went to war for the Confederate States and seven for the Union Army," the website says. 
The fraternity now boasts more than 200,000 living alumni, along with about 15,000 undergraduates populating 219 chapters and 20 "colonies" seeking full membership at universities. SAE has had to work hard to change recently after a string of member deaths, many blamed on the hazing of new recruits, SAE national President Bradley Cohen wrote in a message on the fraternity's website. The fraternity's website lists more than 130 chapters cited or suspended for "health and safety incidents" since 2010. At least 30 of the incidents involved hazing, and dozens more involved alcohol. However, the list is missing numerous incidents from recent months. Among them, according to various media outlets: Yale University banned the SAEs from campus activities last month after members allegedly tried to interfere with a sexual misconduct investigation connected to an initiation rite. Stanford University in December suspended SAE housing privileges after finding sorority members attending a fraternity function were subjected to graphic sexual content. And Johns Hopkins University in November suspended the fraternity for underage drinking. "The media has labeled us as the 'nation's deadliest fraternity,' " Cohen said. In 2011, for example, a student died while being coerced into excessive alcohol consumption, according to a lawsuit. SAE's previous insurer dumped the fraternity. "As a result, we are paying Lloyd's of London the highest insurance rates in the Greek-letter world," Cohen said. Universities have turned down SAE's attempts to open new chapters, and the fraternity had to close 12 in 18 months over hazing incidents.""" + + EXPECTED_SUMMARY_STUDENTS = """SAE's national chapter suspended the students, but university president says it's permanent.\nSAE's national chapter has had to work hard to change recently.\nSAE's chapter has more than 200,000 members.\nSAE's chapter has been criticized for its hazing of new recruits.""" + + input_dict = tokenizer_in(ARTICLE_STUDENTS, return_tensors="np") + output_ids = model.generate(input_dict["input_ids"]).sequences + summary = tokenizer_out.batch_decode(output_ids, skip_special_tokens=True) + + self.assertEqual(summary, [EXPECTED_SUMMARY_STUDENTS]) + + +@require_flax +class FlaxBartEncoderDecoderModelTest(FlaxEncoderDecoderMixin, unittest.TestCase): + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = FlaxBertModel(config) + decoder_model = FlaxBartForCausalLM(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester_encoder = FlaxBertModelTester(self, batch_size=13) + model_tester_decoder = FlaxBartStandaloneDecoderModelTester(self, batch_size=13) + encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder() + (config, input_ids, token_type_ids, attention_mask) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + return { + "config": config, + "input_ids": input_ids, + "attention_mask": attention_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "encoder_hidden_states": encoder_hidden_states, + } + + def get_pretrained_model(self): + return 
FlaxEncoderDecoderModel.from_encoder_decoder_pretrained( + "google-bert/bert-base-cased", "facebook/bart-base" + ) + + +@require_flax +class FlaxBertEncoderDecoderModelTest(FlaxEncoderDecoderMixin, unittest.TestCase): + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = FlaxBertModel(config) + decoder_model = FlaxBertForCausalLM(decoder_config) + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + model_tester_encoder = FlaxBertModelTester(self, batch_size=13) + model_tester_decoder = FlaxBertModelTester(self, batch_size=13) + encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs() + decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs_for_decoder() + (config, input_ids, token_type_ids, attention_mask) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + return { + "config": config, + "input_ids": input_ids, + "attention_mask": attention_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "encoder_hidden_states": encoder_hidden_states, + } + + def get_pretrained_model(self): + return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained( + "google-bert/bert-base-cased", "google-bert/bert-base-cased" + ) + + +@require_flax +class FlaxEncoderDecoderModelTest(unittest.TestCase): + def get_from_encoderdecoder_pretrained_model(self): + return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained( + "google-bert/bert-base-cased", "openai-community/gpt2" + ) + + def _check_configuration_tie(self, model): + module = model.module.bind(model.params) + + assert id(module.decoder.config) == id(model.config.decoder) + assert id(module.encoder.config) == id(model.config.encoder) + + @slow + def test_configuration_tie(self): + model = self.get_from_encoderdecoder_pretrained_model() + self._check_configuration_tie(model) diff --git a/docs/transformers/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py b/docs/transformers/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1da3242b90c2e5919e379023536197d7470762 --- /dev/null +++ b/docs/transformers/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py @@ -0,0 +1,850 @@ +# Copyright 2020 HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from __future__ import annotations + +import os +import tempfile +import unittest + +import numpy as np + +from transformers import is_tf_available +from transformers.testing_utils import require_tf, slow + +from ...test_modeling_tf_common import ids_tensor +from ..bert.test_modeling_tf_bert import TFBertModelTester +from ..gpt2.test_modeling_tf_gpt2 import TFGPT2ModelTester +from ..rembert.test_modeling_tf_rembert import TFRemBertModelTester +from ..roberta.test_modeling_tf_roberta import TFRobertaModelTester + + +if is_tf_available(): + from transformers import ( + AutoConfig, + AutoTokenizer, + EncoderDecoderConfig, + TFAutoModel, + TFAutoModelForCausalLM, + TFBertLMHeadModel, + TFBertModel, + TFEncoderDecoderModel, + TFGPT2LMHeadModel, + TFRemBertForCausalLM, + TFRemBertModel, + TFRobertaForCausalLM, + TFRobertaModel, + ) + from transformers.modeling_tf_outputs import TFBaseModelOutput + + +@require_tf +class TFEncoderDecoderMixin: + def get_encoder_decoder_model(self, config, decoder_config): + raise NotImplementedError + + def prepare_config_and_inputs(self): + raise NotImplementedError + + def get_pretrained_model(self): + raise NotImplementedError + + def check_encoder_decoder_model_from_pretrained_configs( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs, + ): + encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) + self.assertTrue(encoder_decoder_config.decoder.is_decoder) + + enc_dec_model = TFEncoderDecoderModel(encoder_decoder_config) + + self.assertTrue(enc_dec_model.config.is_encoder_decoder) + + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + kwargs=kwargs, + ) + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def check_encoder_decoder_model( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs, + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + self.assertTrue(enc_dec_model.config.decoder.is_decoder) + self.assertTrue(enc_dec_model.config.decoder.add_cross_attention) + self.assertTrue(enc_dec_model.config.is_encoder_decoder) + + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + kwargs=kwargs, + ) + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + encoder_outputs = TFBaseModelOutput(last_hidden_state=encoder_hidden_states) + outputs_encoder_decoder = enc_dec_model( + input_ids=None, + encoder_outputs=encoder_outputs, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + kwargs=kwargs, + ) + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, 
(decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def check_encoder_decoder_model_from_pretrained( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + return_dict, + **kwargs, + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} + enc_dec_model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + return_dict=True, + kwargs=kwargs, + ) + + self.assertEqual( + outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) + ) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def check_save_and_load( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs, + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + + outputs = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + kwargs=kwargs, + ) + out_2 = np.array(outputs[0]) + out_2[np.isnan(out_2)] = 0 + + with tempfile.TemporaryDirectory() as tmpdirname: + enc_dec_model.save_pretrained(tmpdirname) + enc_dec_model = TFEncoderDecoderModel.from_pretrained(tmpdirname) + + after_outputs = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + kwargs=kwargs, + ) + out_1 = np.array(after_outputs[0]) + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def check_encoder_decoder_model_labels( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + labels, + **kwargs, + ): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + labels=labels, + kwargs=kwargs, + ) + + # Make sure `loss` exist + self.assertIn("loss", outputs_encoder_decoder) + + batch_size, seq_len = decoder_input_ids.shape + expected_shape = (batch_size, seq_len, decoder_config.vocab_size) + self.assertEqual(outputs_encoder_decoder["logits"].shape, expected_shape) + self.assertEqual( + outputs_encoder_decoder["encoder_last_hidden_state"].shape, (input_ids.shape + (config.hidden_size,)) + ) + + def _check_output_with_attentions( + self, outputs_encoder_decoder, config, input_ids, decoder_config, decoder_input_ids + ): + encoder_attentions = outputs_encoder_decoder["encoder_attentions"] + self.assertEqual(len(encoder_attentions), 
config.num_hidden_layers) + + self.assertEqual( + encoder_attentions[0].shape[-3:], (config.num_attention_heads, input_ids.shape[-1], input_ids.shape[-1]) + ) + + decoder_attentions = outputs_encoder_decoder["decoder_attentions"] + num_decoder_layers = ( + decoder_config.num_decoder_layers + if hasattr(decoder_config, "num_decoder_layers") + else decoder_config.num_hidden_layers + ) + self.assertEqual(len(decoder_attentions), num_decoder_layers) + + self.assertEqual( + decoder_attentions[0].shape[-3:], + (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]), + ) + + cross_attentions = outputs_encoder_decoder["cross_attentions"] + self.assertEqual(len(cross_attentions), num_decoder_layers) + + cross_attention_input_seq_len = decoder_input_ids.shape[-1] * ( + 1 + (decoder_config.ngram if hasattr(decoder_config, "ngram") else 0) + ) + self.assertEqual( + cross_attentions[0].shape[-3:], + (decoder_config.num_attention_heads, cross_attention_input_seq_len, input_ids.shape[-1]), + ) + + def check_encoder_decoder_model_output_attentions( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs, + ): + # make the decoder inputs a different shape from the encoder inputs to harden the test + decoder_input_ids = decoder_input_ids[:, :-1] + decoder_attention_mask = decoder_attention_mask[:, :-1] + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + output_attentions=True, + kwargs=kwargs, + ) + self._check_output_with_attentions( + outputs_encoder_decoder, config, input_ids, decoder_config, decoder_input_ids + ) + + def check_encoder_decoder_model_output_attentions_from_config( + self, + config, + input_ids, + attention_mask, + encoder_hidden_states, + decoder_config, + decoder_input_ids, + decoder_attention_mask, + **kwargs, + ): + # Similar to `check_encoder_decoder_model_output_attentions`, but with `output_attentions` triggered from the + # config file. Contrarily to most models, changing the model's config won't work -- the defaults are loaded + # from the inner models' configurations. 
+ + decoder_input_ids = decoder_input_ids[:, :-1] + decoder_attention_mask = decoder_attention_mask[:, :-1] + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + enc_dec_model.config.output_attentions = True # model config -> won't work + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + kwargs=kwargs, + ) + self.assertTrue( + all( + key not in outputs_encoder_decoder + for key in ["encoder_attentions", "decoder_attentions", "cross_attentions"] + ) + ) + + config.output_attentions = True # inner model config -> will work + decoder_config.output_attentions = True + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + outputs_encoder_decoder = enc_dec_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, + kwargs=kwargs, + ) + self._check_output_with_attentions( + outputs_encoder_decoder, config, input_ids, decoder_config, decoder_input_ids + ) + + def check_encoder_decoder_model_generate(self, input_ids, config, decoder_config, **kwargs): + encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) + enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) + + # Generate until max length + if hasattr(enc_dec_model.config, "eos_token_id"): + enc_dec_model.config.eos_token_id = None + if hasattr(enc_dec_model.config, "decoder") and hasattr(enc_dec_model.config.decoder, "eos_token_id"): + enc_dec_model.config.decoder.eos_token_id = None + if hasattr(enc_dec_model.generation_config, "eos_token_id"): + enc_dec_model.generation_config.eos_token_id = None + + # Bert does not have a bos token id, so use pad_token_id instead + generated_output = enc_dec_model.generate( + input_ids, decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id + ) + self.assertEqual(tuple(generated_output.shape.as_list()), (input_ids.shape[0],) + (decoder_config.max_length,)) + + def test_encoder_decoder_model(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model(**input_ids_dict) + + def test_encoder_decoder_model_from_pretrained_configs(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_from_pretrained_configs(**input_ids_dict) + + def test_encoder_decoder_model_from_pretrained(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=False) + + def test_encoder_decoder_model_from_pretrained_return_dict(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=True) + + def test_save_and_load_from_pretrained(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_save_and_load(**input_ids_dict) + + def test_encoder_decoder_model_labels(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_labels(**input_ids_dict) + + def test_encoder_decoder_model_output_attentions(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_output_attentions(**input_ids_dict) + 
+ def test_encoder_decoder_model_output_attentions_from_config(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_output_attentions_from_config(**input_ids_dict) + + def test_encoder_decoder_model_generate(self): + input_ids_dict = self.prepare_config_and_inputs() + self.check_encoder_decoder_model_generate(**input_ids_dict) + + def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): + diff = np.abs(a - b).max() + self.assertLessEqual(diff, tol, f"Difference between torch and tf is {diff} (>= {tol}).") + + def test_model_save_load_from_pretrained(self): + model_2 = self.get_pretrained_model() + input_ids = ids_tensor([13, 5], model_2.config.encoder.vocab_size) + decoder_input_ids = ids_tensor([13, 1], model_2.config.decoder.vocab_size) + attention_mask = ids_tensor([13, 5], vocab_size=2) + + outputs = model_2( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + ) + out_2 = np.array(outputs[0]) + out_2[np.isnan(out_2)] = 0 + + with tempfile.TemporaryDirectory() as tmp_dirname: + model_2.save_pretrained(tmp_dirname) + model_1 = TFEncoderDecoderModel.from_pretrained(tmp_dirname) + + after_outputs = model_1( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + attention_mask=attention_mask, + ) + out_1 = np.array(after_outputs[0]) + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + +@require_tf +class TFBertEncoderDecoderModelTest(TFEncoderDecoderMixin, unittest.TestCase): + def setUp(self): + self.encoder_model_tester = TFBertModelTester(self, batch_size=13) + self.decoder_model_tester = TFBertModelTester(self, batch_size=13) + + def get_pretrained_model(self): + return TFEncoderDecoderModel.from_encoder_decoder_pretrained( + "hf-internal-testing/tiny-random-bert", + "hf-internal-testing/tiny-random-bert", + ) + + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = TFBertModel(config, name="encoder") + decoder_model = TFBertLMHeadModel(decoder_config, name="decoder") + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + encoder_config_and_inputs = self.encoder_model_tester.prepare_config_and_inputs() + decoder_config_and_inputs = self.decoder_model_tester.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + token_type_ids, + attention_mask, + sequence_labels, + token_labels, + choice_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_token_type_ids, + decoder_attention_mask, + decoder_sequence_labels, + decoder_token_labels, + decoder_choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + # disable cache for now + decoder_config.use_cache = False + return { + "config": config, + "input_ids": input_ids, + "attention_mask": attention_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_token_type_ids": decoder_token_type_ids, + "decoder_attention_mask": decoder_attention_mask, + "decoder_sequence_labels": decoder_sequence_labels, + "decoder_token_labels": decoder_token_labels, + "decoder_choice_labels": decoder_choice_labels, + "encoder_hidden_states": encoder_hidden_states, + "labels": decoder_token_labels, + } + + +@require_tf +class TFGPT2EncoderDecoderModelTest(TFEncoderDecoderMixin, unittest.TestCase): + def setUp(self): 
+ self.encoder_model_tester = TFBertModelTester(self, batch_size=13) + self.decoder_model_tester = TFGPT2ModelTester(self) + + def get_pretrained_model(self): + return TFEncoderDecoderModel.from_encoder_decoder_pretrained( + "hf-internal-testing/tiny-random-bert", + "hf-internal-testing/tiny-random-gpt2", + ) + + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = TFBertModel(config, name="encoder") + decoder_model = TFGPT2LMHeadModel(decoder_config, name="decoder") + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + encoder_config_and_inputs = self.encoder_model_tester.prepare_config_and_inputs() + decoder_config_and_inputs = self.decoder_model_tester.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + token_type_ids, + attention_mask, + sequence_labels, + token_labels, + choice_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_attention_mask, + decoder_head_mask, + decoder_token_type_ids, + decoder_sequence_labels, + decoder_token_labels, + decoder_choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + # disable cache for now + decoder_config.use_cache = False + return { + "config": config, + "input_ids": input_ids, + "attention_mask": attention_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_token_type_ids": decoder_token_type_ids, + "decoder_attention_mask": decoder_attention_mask, + "decoder_sequence_labels": decoder_sequence_labels, + "decoder_token_labels": decoder_token_labels, + "decoder_choice_labels": decoder_choice_labels, + "encoder_hidden_states": encoder_hidden_states, + "labels": decoder_token_labels, + } + + +@require_tf +class TFRoBertaEncoderDecoderModelTest(TFEncoderDecoderMixin, unittest.TestCase): + def setUp(self): + self.encoder_model_tester = TFRobertaModelTester(self) + self.decoder_model_tester = TFRobertaModelTester(self) + + def get_pretrained_model(self): + return TFEncoderDecoderModel.from_encoder_decoder_pretrained( + "hf-internal-testing/tiny-random-roberta", + "hf-internal-testing/tiny-random-roberta", + ) + + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = TFRobertaModel(config, name="encoder") + decoder_model = TFRobertaForCausalLM(decoder_config, name="decoder") + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + encoder_config_and_inputs = self.encoder_model_tester.prepare_config_and_inputs() + decoder_config_and_inputs = self.decoder_model_tester.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_token_type_ids, + decoder_input_mask, + decoder_sequence_labels, + decoder_token_labels, + decoder_choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + # disable cache for now + decoder_config.use_cache = False + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_token_type_ids": decoder_token_type_ids, + "decoder_attention_mask": decoder_input_mask, + 
"decoder_sequence_labels": decoder_sequence_labels, + "decoder_token_labels": decoder_token_labels, + "decoder_choice_labels": decoder_choice_labels, + "encoder_hidden_states": encoder_hidden_states, + "labels": decoder_token_labels, + } + + +@require_tf +class TFRembertEncoderDecoderModelTest(TFEncoderDecoderMixin, unittest.TestCase): + def setUp(self): + self.encoder_model_tester = TFRemBertModelTester(self) + self.decoder_model_tester = TFRemBertModelTester(self) + + def get_pretrained_model(self): + return TFEncoderDecoderModel.from_encoder_decoder_pretrained( + "hf-internal-testing/tiny-random-rembert", + "hf-internal-testing/tiny-random-rembert", + ) + + def get_encoder_decoder_model(self, config, decoder_config): + encoder_model = TFRemBertModel(config, name="encoder") + decoder_model = TFRemBertForCausalLM(decoder_config, name="decoder") + return encoder_model, decoder_model + + def prepare_config_and_inputs(self): + encoder_config_and_inputs = self.encoder_model_tester.prepare_config_and_inputs() + decoder_config_and_inputs = self.decoder_model_tester.prepare_config_and_inputs_for_decoder() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = encoder_config_and_inputs + ( + decoder_config, + decoder_input_ids, + decoder_token_type_ids, + decoder_input_mask, + decoder_sequence_labels, + decoder_token_labels, + decoder_choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = decoder_config_and_inputs + + # make sure that cross attention layers are added + decoder_config.add_cross_attention = True + # disable cache for now + decoder_config.use_cache = False + return { + "config": config, + "input_ids": input_ids, + "attention_mask": input_mask, + "decoder_config": decoder_config, + "decoder_input_ids": decoder_input_ids, + "decoder_token_type_ids": decoder_token_type_ids, + "decoder_attention_mask": decoder_input_mask, + "decoder_sequence_labels": decoder_sequence_labels, + "decoder_token_labels": decoder_token_labels, + "decoder_choice_labels": decoder_choice_labels, + "encoder_hidden_states": encoder_hidden_states, + "labels": decoder_token_labels, + } + + +@require_tf +class TFEncoderDecoderModelTest(unittest.TestCase): + def get_from_encoderdecoder_pretrained_model(self): + return TFEncoderDecoderModel.from_encoder_decoder_pretrained( + "google-bert/bert-base-cased", "google-bert/bert-base-cased" + ) + + def get_decoder_config(self): + config = AutoConfig.from_pretrained("google-bert/bert-base-cased") + config.is_decoder = True + config.add_cross_attention = True + return config + + def get_encoderdecoder_model(self): + return TFEncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16") + + def get_encoder_decoder_models(self): + encoder_model = TFBertModel.from_pretrained("google-bert/bert-base-cased", name="encoder") + decoder_model = TFBertLMHeadModel.from_pretrained( + "google-bert/bert-base-cased", config=self.get_decoder_config(), name="decoder" + ) + return {"encoder": encoder_model, "decoder": decoder_model} + + def _check_configuration_tie(self, model): + assert id(model.decoder.config) == id(model.config.decoder) + assert id(model.encoder.config) == id(model.config.encoder) + + @slow + def test_configuration_tie(self): + model = self.get_from_encoderdecoder_pretrained_model() + self._check_configuration_tie(model) + + model = TFEncoderDecoderModel(**self.get_encoder_decoder_models()) + self._check_configuration_tie(model) + + # # This should be enabled once we upload 
the TF version of + # # "patrickvonplaten/bert2bert-cnn_dailymail-fp16" to the Hub. + # model = self.get_encoderdecoder_model() + # self._check_configuration_tie(model) + + +@require_tf +class TFEncoderDecoderModelSaveLoadTests(unittest.TestCase): + def get_encoder_decoder_config(self): + encoder_config = AutoConfig.from_pretrained("google-bert/bert-base-uncased") + decoder_config = AutoConfig.from_pretrained( + "google-bert/bert-base-uncased", is_decoder=True, add_cross_attention=True + ) + return EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config) + + def get_encoder_decoder_config_small(self): + encoder_config = AutoConfig.from_pretrained("hf-internal-testing/tiny-bert") + decoder_config = AutoConfig.from_pretrained( + "hf-internal-testing/tiny-bert", is_decoder=True, add_cross_attention=True + ) + return EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config) + + def test_encoder_decoder_save_load_from_encoder_decoder(self): + config = self.get_encoder_decoder_config_small() + + # create two random BERT models for bert2bert & initialize weights (+cross_attention weights) + encoder = TFBertModel(config.encoder) + encoder.build_in_name_scope() + decoder = TFBertLMHeadModel(config.decoder) + decoder.build_in_name_scope() + + encoder_decoder_orig = TFEncoderDecoderModel(encoder=encoder, decoder=decoder) + + input_ids = ids_tensor([13, 5], encoder.config.vocab_size) + decoder_input_ids = ids_tensor([13, 1], decoder.config.vocab_size) + + logits_orig = encoder_decoder_orig(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits + + with tempfile.TemporaryDirectory() as tmp_dirname: + encoder_path = os.path.join(tmp_dirname, "encoder") + decoder_path = os.path.join(tmp_dirname, "decoder") + + encoder.save_pretrained(encoder_path) + decoder.save_pretrained(decoder_path) + + encoder_decoder = TFEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_path, decoder_path) + + logits_1 = encoder_decoder(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits + + self.assertTrue(logits_orig.numpy().sum() - logits_1.numpy().sum() < 1e-3) + + max_diff = np.max(np.abs(logits_1.numpy() - logits_orig.numpy())) + self.assertAlmostEqual(max_diff, 0.0, places=4) + + with tempfile.TemporaryDirectory() as tmp_dirname: + encoder_decoder.save_pretrained(tmp_dirname) + encoder_decoder = TFEncoderDecoderModel.from_pretrained(tmp_dirname) + + logits_2 = encoder_decoder(input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits + + max_diff = np.max(np.abs(logits_2.numpy() - logits_orig.numpy())) + self.assertAlmostEqual(max_diff, 0.0, places=4) + + @slow + def test_encoder_decoder_from_pretrained(self): + load_weight_prefix = TFEncoderDecoderModel.load_weight_prefix + + config = self.get_encoder_decoder_config() + encoder_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + decoder_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + + input_ids = encoder_tokenizer("who sings does he love me with reba", return_tensors="tf").input_ids + decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids + + with tempfile.TemporaryDirectory() as tmp_dirname: + # Since most of HF's models don't have pretrained cross-attention layers, they are randomly + # initialized even if we create models using `from_pretrained` method. + # For the tests, the decoder need to be a model with pretrained cross-attention layers. 
+ # So we create pretrained models (without `load_weight_prefix`), save them, and later, + # we load them using `from_pretrained`. + # (we don't need to do this for encoder, but let's make the code more similar between encoder/decoder) + encoder = TFAutoModel.from_pretrained("google-bert/bert-base-uncased", name="encoder") + # It's necessary to specify `add_cross_attention=True` here. + decoder = TFAutoModelForCausalLM.from_pretrained( + "google-bert/bert-base-uncased", is_decoder=True, add_cross_attention=True, name="decoder" + ) + pretrained_encoder_dir = os.path.join(tmp_dirname, "pretrained_encoder") + pretrained_decoder_dir = os.path.join(tmp_dirname, "pretrained_decoder") + encoder.save_pretrained(pretrained_encoder_dir) + decoder.save_pretrained(pretrained_decoder_dir) + del encoder + del decoder + + enc_dec_model = TFEncoderDecoderModel.from_encoder_decoder_pretrained( + pretrained_encoder_dir, + pretrained_decoder_dir, + ) + # check that the from pretrained methods work + enc_dec_model.save_pretrained(tmp_dirname) + enc_dec_model = TFEncoderDecoderModel.from_pretrained(tmp_dirname) + + output = enc_dec_model(input_ids, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids) + + loss_pretrained = output.loss + del enc_dec_model + + # Create the model using `__init__` with loaded ``pretrained`` encoder / decoder + encoder = TFAutoModel.from_pretrained( + pretrained_encoder_dir, load_weight_prefix=load_weight_prefix, name="encoder" + ) + decoder = TFAutoModelForCausalLM.from_pretrained( + pretrained_decoder_dir, load_weight_prefix=load_weight_prefix, name="decoder" + ) + enc_dec_model = TFEncoderDecoderModel(config=config, encoder=encoder, decoder=decoder) + + output = enc_dec_model(input_ids, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids) + + loss_init = output.loss + + max_diff = np.max(np.abs(loss_pretrained - loss_init)) + expected_diff = 0.0 + + self.assertAlmostEqual(max_diff, expected_diff, places=4) diff --git a/docs/transformers/tests/models/ernie/__init__.py b/docs/transformers/tests/models/ernie/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/ernie/test_modeling_ernie.py b/docs/transformers/tests/models/ernie/test_modeling_ernie.py new file mode 100644 index 0000000000000000000000000000000000000000..7e99ba8e81db2c0acfb8d27d622745c24cbac79a --- /dev/null +++ b/docs/transformers/tests/models/ernie/test_modeling_ernie.py @@ -0,0 +1,591 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import tempfile +import unittest + +from transformers import ErnieConfig, is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, require_torch_accelerator, slow, torch_device + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + ErnieForCausalLM, + ErnieForMaskedLM, + ErnieForMultipleChoice, + ErnieForNextSentencePrediction, + ErnieForPreTraining, + ErnieForQuestionAnswering, + ErnieForSequenceClassification, + ErnieForTokenClassification, + ErnieModel, + ) + + +class ErnieModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + """ + Returns a tiny configuration by default. 
+ """ + return ErnieConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ErnieModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = ErnieModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = ErnieForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ErnieForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, 
(self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_model_for_causal_lm_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = ErnieForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + encoder_hidden_states=encoder_hidden_states, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = ErnieForCausalLM(config=config).to(torch_device).eval() + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_next_sequence_prediction( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ErnieForNextSentencePrediction(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=sequence_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) + + def create_and_check_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ErnieForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model( + 
input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + next_sentence_label=sequence_labels, + ) + self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = ErnieForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = ErnieForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = ErnieForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = ErnieForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class ErnieModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + ErnieModel, + ErnieForCausalLM, + ErnieForMaskedLM, + ErnieForMultipleChoice, + ErnieForNextSentencePrediction, + ErnieForPreTraining, + ErnieForQuestionAnswering, + ErnieForSequenceClassification, + ErnieForTokenClassification, + ) + if is_torch_available() + else () + ) + 
pipeline_model_mapping = ( + { + "feature-extraction": ErnieModel, + "fill-mask": ErnieForMaskedLM, + "question-answering": ErnieForQuestionAnswering, + "text-classification": ErnieForSequenceClassification, + "text-generation": ErnieForCausalLM, + "token-classification": ErnieForTokenClassification, + "zero-shot": ErnieForSequenceClassification, + } + if is_torch_available() + else {} + ) + fx_compatible = False + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["next_sentence_label"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = ErnieModelTester(self) + self.config_tester = ConfigTester(self, config_class=ErnieConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_causal_lm_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_for_causal_lm_as_decoder(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + config_and_inputs[0].position_embedding_type = "relative_key" + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_multiple_choice(self): + 
config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_next_sequence_prediction(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_next_sequence_prediction(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_pretraining(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "nghuyong/ernie-1.0-base-zh" + model = ErnieModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + @require_torch_accelerator + def test_torchscript_device_change(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + if model_class == ErnieForMultipleChoice: + self.skipTest(reason="ErnieForMultipleChoice behaves incorrectly in JIT environments.") + + config.torchscript = True + model = model_class(config=config) + + inputs_dict = self._prepare_for_class(inputs_dict, model_class) + traced_model = torch.jit.trace( + model, (inputs_dict["input_ids"].to("cpu"), inputs_dict["attention_mask"].to("cpu")) + ) + + with tempfile.TemporaryDirectory() as tmp: + torch.jit.save(traced_model, os.path.join(tmp, "ernie.pt")) + loaded = torch.jit.load(os.path.join(tmp, "ernie.pt"), map_location=torch_device) + loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device)) diff --git a/docs/transformers/tests/models/esm/__init__.py b/docs/transformers/tests/models/esm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/esm/test_modeling_esm.py b/docs/transformers/tests/models/esm/test_modeling_esm.py new file mode 100644 index 0000000000000000000000000000000000000000..74f4c277d09271ba23723470f0e67be4076d5321 --- /dev/null +++ b/docs/transformers/tests/models/esm/test_modeling_esm.py @@ -0,0 +1,346 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch ESM model.""" + +import unittest + +from transformers import EsmConfig, is_torch_available +from transformers.testing_utils import TestCasePlus, require_bitsandbytes, require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import EsmForMaskedLM, EsmForSequenceClassification, EsmForTokenClassification, EsmModel + from transformers.models.esm.modeling_esm import ( + EsmEmbeddings, + create_position_ids_from_input_ids, + ) + + +# copied from tests.test_modeling_roberta +class EsmModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=False, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=33, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return EsmConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + pad_token_id=1, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = EsmModel(config=config) + 
model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = EsmForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_token_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = EsmForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_forward_and_backwards( + self, + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + gradient_checkpointing=False, + ): + model = EsmForMaskedLM(config) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + model.to(torch_device) + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + result.loss.backward() + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class EsmModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + test_mismatched_shapes = False + + all_model_classes = ( + ( + EsmForMaskedLM, + EsmModel, + EsmForSequenceClassification, + EsmForTokenClassification, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": EsmModel, + "fill-mask": EsmForMaskedLM, + "text-classification": EsmForSequenceClassification, + "token-classification": EsmForTokenClassification, + "zero-shot": EsmForSequenceClassification, + } + if is_torch_available() + else {} + ) + test_sequence_classification_problem_types = True + model_split_percents = [0.5, 0.8, 0.9] + + def setUp(self): + self.model_tester = EsmModelTester(self) + self.config_tester = ConfigTester(self, config_class=EsmConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def 
test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_esm_gradient_checkpointing(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/esm2_t6_8M_UR50D" + model = EsmModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_create_position_ids_respects_padding_index(self): + """This is a regression test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is EsmEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + model = EsmEmbeddings(config=config) + + input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) + expected_positions = torch.as_tensor( + [ + [ + 0 + model.padding_idx + 1, + 1 + model.padding_idx + 1, + 2 + model.padding_idx + 1, + model.padding_idx, + ] + ] + ) + position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + def test_create_position_ids_from_inputs_embeds(self): + """This is a regression test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is EsmEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + embeddings = EsmEmbeddings(config=config) + + inputs_embeds = torch.empty(2, 4, 30) + expected_single_positions = [ + 0 + embeddings.padding_idx + 1, + 1 + embeddings.padding_idx + 1, + 2 + embeddings.padding_idx + 1, + 3 + embeddings.padding_idx + 1, + ] + expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) + position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + @unittest.skip(reason="Esm does not support embedding resizing") + def test_resize_embeddings_untied(self): + pass + + @unittest.skip(reason="Esm does not support embedding resizing") + def test_resize_tokens_embeddings(self): + pass + + +@slow +@require_torch +class EsmModelIntegrationTest(TestCasePlus): + def test_inference_masked_lm(self): + with torch.no_grad(): + model = EsmForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D") + model.eval() + input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + vocab_size = 33 + + expected_shape = torch.Size((1, 6, vocab_size)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor( + [[[8.9215, -10.5898, -6.4671], [-6.3967, -13.9114, -1.1212], [-7.7812, -13.9516, -3.7406]]] + ) + torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + def test_inference_no_head(self): + with torch.no_grad(): + model = EsmModel.from_pretrained("facebook/esm2_t6_8M_UR50D") + model.eval() + + input_ids = torch.tensor([[0, 6, 4, 13, 5, 4, 16, 12, 11, 7, 2]]) + output = model(input_ids)[0] + # 
compare the actual values for a slice. + expected_slice = torch.tensor( + [[[0.1444, 0.5413, 0.3248], [0.3034, 0.0053, 0.3108], [0.3228, -0.2499, 0.3415]]] + ) + torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + @require_bitsandbytes + def test_inference_bitsandbytes(self): + model = EsmForMaskedLM.from_pretrained("facebook/esm2_t36_3B_UR50D", load_in_8bit=True) + + input_ids = torch.tensor([[0, 6, 4, 13, 5, 4, 16, 12, 11, 7, 2]]).to(model.device) + # Just test if inference works + with torch.no_grad(): + _ = model(input_ids)[0] + + model = EsmForMaskedLM.from_pretrained("facebook/esm2_t36_3B_UR50D", load_in_4bit=True) + + input_ids = torch.tensor([[0, 6, 4, 13, 5, 4, 16, 12, 11, 7, 2]]).to(model.device) + # Just test if inference works + _ = model(input_ids)[0] diff --git a/docs/transformers/tests/models/esm/test_modeling_esmfold.py b/docs/transformers/tests/models/esm/test_modeling_esmfold.py new file mode 100644 index 0000000000000000000000000000000000000000..b13e7fe58b1d4d96f230ab6684bfb39de9e93a7d --- /dev/null +++ b/docs/transformers/tests/models/esm/test_modeling_esmfold.py @@ -0,0 +1,279 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch ESM model.""" + +import unittest + +from transformers import EsmConfig, is_torch_available +from transformers.testing_utils import TestCasePlus, is_flaky, require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers.models.esm.modeling_esmfold import EsmForProteinFolding + + +class EsmFoldModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=False, + use_input_mask=True, + use_token_type_ids=False, + use_labels=False, + vocab_size=19, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + 
self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + esmfold_config = { + "trunk": { + "num_blocks": 2, + "sequence_state_dim": 64, + "pairwise_state_dim": 16, + "sequence_head_width": 4, + "pairwise_head_width": 4, + "position_bins": 4, + "chunk_size": 16, + "structure_module": { + "ipa_dim": 16, + "num_angles": 7, + "num_blocks": 2, + "num_heads_ipa": 4, + "pairwise_dim": 16, + "resnet_dim": 16, + "sequence_dim": 48, + }, + }, + "fp16_esm": False, + "lddt_head_hid_dim": 16, + } + config = EsmConfig( + vocab_size=33, + hidden_size=self.hidden_size, + pad_token_id=1, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + is_folding_model=True, + esmfold_config=esmfold_config, + ) + return config + + def create_and_check_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = EsmForProteinFolding(config=config).float() + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + result = model(input_ids) + + self.parent.assertEqual(result.positions.shape, (2, self.batch_size, self.seq_length, 14, 3)) + self.parent.assertEqual(result.angles.shape, (2, self.batch_size, self.seq_length, 7, 2)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class EsmFoldModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + test_mismatched_shapes = False + + all_model_classes = (EsmForProteinFolding,) if is_torch_available() else () + pipeline_model_mapping = {} if is_torch_available() else {} + test_sequence_classification_problem_types = False + + def setUp(self): + self.model_tester = EsmFoldModelTester(self) + self.config_tester = ConfigTester(self, config_class=EsmConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @is_flaky( + description="The computed `s = s / norm_denom` in `EsmFoldAngleResnet` is 
numerically instable if `norm_denom` is very small." + ) + def test_batching_equivalence(self): + super().test_batching_equivalence() + + @unittest.skip(reason="Does not support attention outputs") + def test_attention_outputs(self): + pass + + @unittest.skip + def test_correct_missing_keys(self): + pass + + @unittest.skip(reason="Esm does not support embedding resizing") + def test_resize_embeddings_untied(self): + pass + + @unittest.skip(reason="Esm does not support embedding resizing") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="ESMFold does not support passing input embeds!") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="ESMFold does not support head pruning.") + def test_head_pruning(self): + pass + + @unittest.skip(reason="ESMFold does not support head pruning.") + def test_head_pruning_integration(self): + pass + + @unittest.skip(reason="ESMFold does not support head pruning.") + def test_head_pruning_save_load_from_config_init(self): + pass + + @unittest.skip(reason="ESMFold does not support head pruning.") + def test_head_pruning_save_load_from_pretrained(self): + pass + + @unittest.skip(reason="ESMFold does not support head pruning.") + def test_headmasking(self): + pass + + @unittest.skip(reason="ESMFold does not output hidden states in the normal way.") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="ESMfold does not output hidden states in the normal way.") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="ESMFold only has one output format.") + def test_model_outputs_equivalence(self): + pass + + @unittest.skip(reason="ESMFold does not support input chunking.") + def test_feed_forward_chunking(self): + pass + + @unittest.skip( + reason="ESMFold doesn't respect you and it certainly doesn't respect your initialization arguments." + ) + def test_initialization(self): + pass + + @unittest.skip(reason="ESMFold doesn't support torchscript compilation.") + def test_torchscript_output_attentions(self): + pass + + @unittest.skip(reason="ESMFold doesn't support torchscript compilation.") + def test_torchscript_output_hidden_state(self): + pass + + @unittest.skip(reason="ESMFold doesn't support torchscript compilation.") + def test_torchscript_simple(self): + pass + + @unittest.skip(reason="ESMFold doesn't support data parallel.") + def test_multi_gpu_data_parallel_forward(self): + pass + + +@require_torch +class EsmModelIntegrationTest(TestCasePlus): + @slow + def test_inference_protein_folding(self): + model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1").float() + model.eval() + input_ids = torch.tensor([[0, 6, 4, 13, 5, 4, 16, 12, 11, 7, 2]]) + position_outputs = model(input_ids)["positions"] + expected_slice = torch.tensor([2.5828, 0.7993, -10.9334], dtype=torch.float32) + torch.testing.assert_close(position_outputs[0, 0, 0, 0], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/esm/test_modeling_tf_esm.py b/docs/transformers/tests/models/esm/test_modeling_tf_esm.py new file mode 100644 index 0000000000000000000000000000000000000000..52d163d2dc85e3dee6477d37510d1f9e88df0288 --- /dev/null +++ b/docs/transformers/tests/models/esm/test_modeling_tf_esm.py @@ -0,0 +1,323 @@ +# Copyright 2022 The HuggingFace Inc. Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import annotations + +import unittest + +from transformers import EsmConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import numpy + import tensorflow as tf + + from transformers.modeling_tf_utils import keras + from transformers.models.esm.modeling_tf_esm import ( + TFEsmForMaskedLM, + TFEsmForSequenceClassification, + TFEsmForTokenClassification, + TFEsmModel, + ) + + +# copied from tests.test_modeling_tf_roberta +class TFEsmModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 2 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = EsmConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + pad_token_id=1, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model(self, 
config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = TFEsmModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + + model = TFEsmModel(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "encoder_hidden_states": encoder_hidden_states, + "encoder_attention_mask": encoder_attention_mask, + } + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs, encoder_hidden_states=encoder_hidden_states) + + # Also check the case where encoder outputs are not passed + result = model(input_ids, attention_mask=input_mask) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFEsmForMaskedLM(config=config) + result = model([input_ids, input_mask]) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_token_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFEsmForTokenClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFEsmModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + TFEsmModel, + TFEsmForMaskedLM, + TFEsmForSequenceClassification, + TFEsmForTokenClassification, + ) + if is_tf_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": TFEsmModel, + "fill-mask": TFEsmForMaskedLM, + "text-classification": TFEsmForSequenceClassification, + "token-classification": TFEsmForTokenClassification, + "zero-shot": TFEsmForSequenceClassification, + } + if is_tf_available() + else {} + ) + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFEsmModelTester(self) + self.config_tester = ConfigTester(self, config_class=EsmConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + """Test the base model""" + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_as_decoder(self): + """Test the base model as a decoder (of an encoder-decoder architecture) + + is_deocder=True + cross_attention + pass encoder outputs + """ + config_and_inputs = 
self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/esm2_t6_8M_UR50D" + model = TFEsmModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @unittest.skip("Protein models do not support embedding resizing.") + def test_resize_token_embeddings(self): + pass + + @unittest.skip("Protein models do not support embedding resizing.") + def test_save_load_after_resize_token_embeddings(self): + pass + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), keras.layers.Layer) + if model_class is TFEsmForMaskedLM: + # Output embedding test differs from the main test because they're a matrix, not a layer + name = model.get_bias() + assert isinstance(name, dict) + for k, v in name.items(): + assert isinstance(v, tf.Variable) + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + +@require_tf +class TFEsmModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = TFEsmForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D") + + input_ids = tf.constant([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + expected_shape = [1, 6, 33] + self.assertEqual(list(output.numpy().shape), expected_shape) + # compare the actual values for a slice. + expected_slice = tf.constant( + [ + [ + [8.921518, -10.589814, -6.4671307], + [-6.3967156, -13.911377, -1.1211915], + [-7.781247, -13.951557, -3.740592], + ] + ] + ) + self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-2)) + + @slow + def test_inference_no_head(self): + model = TFEsmModel.from_pretrained("facebook/esm2_t6_8M_UR50D") + + input_ids = tf.constant([[0, 6, 4, 13, 5, 4, 16, 12, 11, 7, 2]]) + output = model(input_ids)[0] + # compare the actual values for a slice. + expected_slice = tf.constant( + [ + [ + [0.14443092, 0.54125327, 0.3247739], + [0.30340484, 0.00526676, 0.31077722], + [0.32278043, -0.24987096, 0.3414628], + ] + ] + ) + self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4)) diff --git a/docs/transformers/tests/models/esm/test_tokenization_esm.py b/docs/transformers/tests/models/esm/test_tokenization_esm.py new file mode 100644 index 0000000000000000000000000000000000000000..57c66d53a8c41fe742e2d186ac909fe4bf20fa3d --- /dev/null +++ b/docs/transformers/tests/models/esm/test_tokenization_esm.py @@ -0,0 +1,118 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import tempfile +import unittest +from functools import lru_cache + +from transformers.models.esm.tokenization_esm import VOCAB_FILES_NAMES, EsmTokenizer +from transformers.testing_utils import require_tokenizers +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.tokenization_utils_base import PreTrainedTokenizerBase + +from ...test_tokenization_common import use_cache_if_possible + + +@require_tokenizers +class ESMTokenizationTest(unittest.TestCase): + tokenizer_class = EsmTokenizer + + @classmethod + def setUpClass(cls): + super().setUpClass() + + cls.tmpdirname = tempfile.mkdtemp() + vocab_tokens: list[str] = ["<cls>", "<pad>", "<eos>", "<unk>", "L", "A", "G", "V", "S", "E", "R", "T", "I", "D", "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C", "X", "B", "U", "Z", "O", ".", "-", "<null_1>", "<mask>"] # fmt: skip + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_tokenizers(cls, **kwargs) -> list[PreTrainedTokenizerBase]: + return [cls.get_tokenizer(**kwargs)] + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizer: + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + def test_tokenizer_single_example(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize("LAGVS") + self.assertListEqual(tokens, ["L", "A", "G", "V", "S"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [4, 5, 6, 7, 8]) + + def test_tokenizer_encode_single(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + seq = "LAGVS" + self.assertListEqual(tokenizer.encode(seq), [0, 4, 5, 6, 7, 8, 2]) + + def test_tokenizer_call_no_pad(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + seq_batch = ["LAGVS", "WCB"] + tokens_batch = tokenizer(seq_batch, padding=False)["input_ids"] + + self.assertListEqual(tokens_batch, [[0, 4, 5, 6, 7, 8, 2], [0, 22, 23, 25, 2]]) + + def test_tokenizer_call_pad(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + seq_batch = ["LAGVS", "WCB"] + tokens_batch = tokenizer(seq_batch, padding=True)["input_ids"] + + self.assertListEqual(tokens_batch, [[0, 4, 5, 6, 7, 8, 2], [0, 22, 23, 25, 2, 1, 1]]) + + def test_tokenize_special_tokens(self): + """Test `tokenize` with special tokens.""" + tokenizers = self.get_tokenizers(fast=True) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + SPECIAL_TOKEN_1 = "<unk>" + SPECIAL_TOKEN_2 = "<mask>" + + token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1) + token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2) + + self.assertEqual(len(token_1), 1) + self.assertEqual(len(token_2), 1) + self.assertEqual(token_1[0], SPECIAL_TOKEN_1) + self.assertEqual(token_2[0], SPECIAL_TOKEN_2) + + def test_add_tokens(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + vocab_size = len(tokenizer) + 
self.assertEqual(tokenizer.add_tokens(""), 0) + self.assertEqual(tokenizer.add_tokens("testoken"), 1) + self.assertEqual(tokenizer.add_tokens(["testoken1", "testtoken2"]), 2) + self.assertEqual(len(tokenizer), vocab_size + 3) + + self.assertEqual(tokenizer.add_special_tokens({}), 0) + self.assertEqual(tokenizer.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2) + self.assertRaises(AssertionError, tokenizer.add_special_tokens, {"additional_special_tokens": "<testtoken1>"}) + self.assertEqual(tokenizer.add_special_tokens({"additional_special_tokens": ["<testtoken2>"]}), 1) + self.assertEqual( + tokenizer.add_special_tokens({"additional_special_tokens": ["<testtoken3>", "<testtoken4>"]}), 2 + ) + self.assertIn("<testtoken3>", tokenizer.special_tokens_map["additional_special_tokens"]) + self.assertIsInstance(tokenizer.special_tokens_map["additional_special_tokens"], list) + self.assertGreaterEqual(len(tokenizer.special_tokens_map["additional_special_tokens"]), 2) + + self.assertEqual(len(tokenizer), vocab_size + 8) diff --git a/docs/transformers/tests/models/falcon/__init__.py b/docs/transformers/tests/models/falcon/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/falcon/test_modeling_falcon.py b/docs/transformers/tests/models/falcon/test_modeling_falcon.py new file mode 100644 index 0000000000000000000000000000000000000000..6a63177476bc60e1090fb01ae01748b892bf9b07 --- /dev/null +++ b/docs/transformers/tests/models/falcon/test_modeling_falcon.py @@ -0,0 +1,476 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Falcon model.""" + +import unittest + +from parameterized import parameterized + +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + FalconConfig, + is_torch_available, + set_seed, +) +from transformers.testing_utils import ( + require_bitsandbytes, + require_torch, + require_torch_sdpa, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + FalconForCausalLM, + FalconForQuestionAnswering, + FalconForSequenceClassification, + FalconForTokenClassification, + FalconModel, + ) + from transformers.models.falcon.modeling_falcon import ( + FalconRotaryEmbedding, + ) + + +class FalconModelTester: + def __init__( + self, + parent, + batch_size=3, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return FalconConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + 
is_decoder=False, + initializer_range=self.initializer_range, + pad_token_id=1, + new_decoder_architecture=True, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = FalconModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class FalconModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + FalconModel, + FalconForCausalLM, + FalconForSequenceClassification, + FalconForTokenClassification, + FalconForQuestionAnswering, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": FalconModel, + "question-answering": FalconForQuestionAnswering, + "text-classification": FalconForSequenceClassification, + "text-generation": FalconForCausalLM, + "token-classification": FalconForTokenClassification, + "zero-shot": FalconForSequenceClassification, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + + # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + return True + + def setUp(self): + self.model_tester = FalconModelTester(self) + self.config_tester = ConfigTester(self, config_class=FalconConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_position_embedding_types(self): + config, *inputs = self.model_tester.prepare_config_and_inputs() + for alibi in [True, False]: + config.alibi = alibi + self.model_tester.create_and_check_model(config, *inputs) + + def test_falcon_sequence_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = FalconForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_falcon_sequence_classification_model_for_single_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "single_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + 
sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = FalconForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_falcon_sequence_classification_model_for_multi_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "multi_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor( + [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size + ).to(torch.float) + model = FalconForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + @parameterized.expand([("linear",), ("dynamic",)]) + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_model_rope_scaling_from_config with Llama->Falcon + def test_model_rope_scaling_from_config(self, scaling_type): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + short_input = ids_tensor([1, 10], config.vocab_size) + long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + original_model = FalconModel(config) + original_model.to(torch_device) + original_model.eval() + original_short_output = original_model(short_input).last_hidden_state + original_long_output = original_model(long_input).last_hidden_state + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + config.rope_scaling = {"type": scaling_type, "factor": 10.0} + scaled_model = FalconModel(config) + scaled_model.to(torch_device) + scaled_model.eval() + scaled_short_output = scaled_model(short_input).last_hidden_state + scaled_long_output = scaled_model(long_input).last_hidden_state + + # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original + # maximum sequence length, so the outputs for the short input should match. 
+ if scaling_type == "dynamic": + torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) + else: + self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) + + # The output should be different for long inputs + self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) + + # Copied from tests.models.gpt_neox.test_modeling_gpt_neox.GPTNeoXModelTest.test_model_rope_scaling with GPTNeoX->Falcon + def test_model_rope_scaling(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + scaling_factor = 10 + short_input_length = 10 + long_input_length = int(config.max_position_embeddings * 1.5) + + # Inputs + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device + position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) + position_ids_short = position_ids_short.unsqueeze(0) + position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) + position_ids_long = position_ids_long.unsqueeze(0) + + # Sanity check original RoPE + original_rope = FalconRotaryEmbedding(config).to(torch_device) + original_cos_short, original_sin_short = original_rope(x, position_ids_short) + original_cos_long, original_sin_long = original_rope(x, position_ids_long) + torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) + + # Sanity check linear RoPE scaling + # New position "x" should match original position with index "x/scaling_factor" + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + linear_scaling_rope = FalconRotaryEmbedding(config).to(torch_device) + linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) + linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) + torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) + for new_position in range(0, long_input_length, scaling_factor): + original_position = int(new_position // scaling_factor) + torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) + torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) + + # Sanity check Dynamic NTK RoPE scaling + # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase + # with scaling_factor (or that `inv_freq` decreases) + config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} + ntk_scaling_rope = FalconRotaryEmbedding(config).to(torch_device) + ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) + ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) + torch.testing.assert_close(ntk_cos_short, original_cos_short) + torch.testing.assert_close(ntk_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_sin_long, original_sin_long) + self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) + + +@require_torch +class FalconLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_falcon(self): + tokenizer = AutoTokenizer.from_pretrained("Rocketknight1/falcon-rw-1b") + model = FalconForCausalLM.from_pretrained("Rocketknight1/falcon-rw-1b") + model.eval() + model.to(torch_device) + inputs = tokenizer("My favorite food is", return_tensors="pt").to(torch_device) + + EXPECTED_OUTPUT = ( + "My favorite food is pizza. I love it so much that I have a pizza party every year for my birthday." + ) + + output_ids = model.generate(**inputs, do_sample=False, max_new_tokens=19) + output_str = tokenizer.batch_decode(output_ids)[0] + + self.assertEqual(output_str, EXPECTED_OUTPUT) + + @slow + @require_bitsandbytes + def test_lm_generate_falcon_11b(self): + tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-11B", padding_side="left") + model = FalconForCausalLM.from_pretrained( + "tiiuae/falcon-11B", device_map={"": torch_device}, load_in_8bit=True + ) + model.eval() + inputs = tokenizer( + "Two roads diverged in a yellow wood,", return_tensors="pt", return_token_type_ids=False + ).to(torch_device) + + EXPECTED_OUTPUT = "Two roads diverged in a yellow wood,\nAnd sorry I could not travel both\n" + + output_ids = model.generate(**inputs, do_sample=False, max_new_tokens=9) + output_str = tokenizer.batch_decode(output_ids)[0] + + self.assertEqual(output_str, EXPECTED_OUTPUT) + + @slow + def test_lm_generation_big_models(self): + # The big models are way too big for the CI, so we use tiny random models that resemble their + # architectures but with much smaller and fewer layers + for repo in ["Rocketknight1/tiny-random-falcon-7b", "Rocketknight1/tiny-random-falcon-40b"]: + tokenizer = AutoTokenizer.from_pretrained(repo) + model = FalconForCausalLM.from_pretrained(repo) + model.eval() + model.to(torch_device) + inputs = tokenizer("My favorite food is", return_tensors="pt").to(torch_device) + + # We just test that these run without errors - the models are randomly initialized + # and so the actual text outputs will be garbage + model.generate(**inputs, do_sample=False, max_new_tokens=4) + model.generate(**inputs, do_sample=True, max_new_tokens=4) + model.generate(**inputs, num_beams=2, max_new_tokens=4) + + @slow + def test_lm_generation_use_cache(self): + # The big models are way too big for the CI, so we use tiny random models that resemble their + # architectures but with much smaller and fewer layers + with torch.no_grad(): + for repo in [ + "Rocketknight1/falcon-rw-1b", + "Rocketknight1/tiny-random-falcon-7b", + "Rocketknight1/tiny-random-falcon-40b", + ]: + tokenizer = AutoTokenizer.from_pretrained(repo) + model = FalconForCausalLM.from_pretrained(repo) + model.eval() + 
model.to(device=torch_device) + inputs = tokenizer("My favorite food is", return_tensors="pt").to(torch_device) + + # Test results are the same with and without cache + outputs_no_cache = model.generate(**inputs, do_sample=False, max_new_tokens=20, use_cache=False) + outputs_cache = model.generate(**inputs, do_sample=False, max_new_tokens=20, use_cache=True) + self.assertTrue((outputs_cache - outputs_no_cache).sum().item() == 0) + + @require_bitsandbytes + @slow + def test_batched_generation(self): + tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", padding_side="left") + tokenizer.pad_token = tokenizer.eos_token + model = AutoModelForCausalLM.from_pretrained( + "tiiuae/falcon-7b", + device_map={"": torch_device}, + load_in_4bit=True, + ) + + test_text = "A sequence: 1, 2" # should generate the rest of the sequence + + unpadded_inputs = tokenizer([test_text], return_tensors="pt").to(f"{torch_device}:0") + unpadded_gen_out = model.generate(**unpadded_inputs, max_new_tokens=20) + unpadded_gen_text = tokenizer.batch_decode(unpadded_gen_out, skip_special_tokens=True) + + dummy_text = "This is a longer text " * 2 # forces left-padding on `test_text` + padded_inputs = tokenizer([test_text, dummy_text], return_tensors="pt", padding=True).to(f"{torch_device}:0") + padded_gen_out = model.generate(**padded_inputs, max_new_tokens=20) + padded_gen_text = tokenizer.batch_decode(padded_gen_out, skip_special_tokens=True) + + expected_output = "A sequence: 1, 2, 3, 4, 5, 6, 7, 8, " + self.assertLess(unpadded_inputs.input_ids.shape[-1], padded_inputs.input_ids.shape[-1]) # left-padding exists + self.assertEqual(unpadded_gen_text[0], expected_output) + self.assertEqual(padded_gen_text[0], expected_output) + + @slow + @require_torch_sdpa + def test_falcon_alibi_sdpa_matches_eager(self): + input_ids = torch.randint(0, 1000, (5, 20)) + + config = FalconConfig( + vocab_size=1000, + hidden_size=64, + num_hidden_layers=3, + num_attention_heads=4, + new_decoder_architecture=True, + alibi=True, + ) + + falcon = FalconForCausalLM(config) + falcon = falcon.eval() + + with torch.no_grad(): + # output_attentions=True dispatches to eager path + falcon_output_eager = falcon(input_ids, output_attentions=True)[0] + falcon_output_sdpa = falcon(input_ids)[0] + + torch.testing.assert_close(falcon_output_eager, falcon_output_sdpa, rtol=1e-3, atol=1e-3) diff --git a/docs/transformers/tests/models/falcon_mamba/__init__.py b/docs/transformers/tests/models/falcon_mamba/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/falcon_mamba/test_modeling_falcon_mamba.py b/docs/transformers/tests/models/falcon_mamba/test_modeling_falcon_mamba.py new file mode 100644 index 0000000000000000000000000000000000000000..d34128ba067e98ca9ac1e318fc24136f0314c22e --- /dev/null +++ b/docs/transformers/tests/models/falcon_mamba/test_modeling_falcon_mamba.py @@ -0,0 +1,548 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +import unittest +from unittest.util import safe_repr + +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, FalconMambaConfig, is_torch_available +from transformers.testing_utils import ( + require_bitsandbytes, + require_torch, + require_torch_accelerator, + require_torch_multi_accelerator, + require_torch_multi_gpu, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + FalconMambaForCausalLM, + FalconMambaModel, + ) + from transformers.cache_utils import MambaCache + + +# Copied from transformers.tests.models.mamba.MambaModelTester with Mamba->FalconMamba,mamba->falcon_mamba +class FalconMambaModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + intermediate_size=32, + hidden_act="silu", + hidden_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + num_labels=3, + num_choices=4, + scope=None, + tie_word_embeddings=True, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + self.tie_word_embeddings = tie_word_embeddings + + # Ignore copy + def get_large_model_config(self): + return FalconMambaConfig.from_pretrained("tiiuae/falcon-mamba-7b") + + def prepare_config_and_inputs( + self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + attention_mask = ids_tensor([self.batch_size, self.seq_length], 1) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config( + gradient_checkpointing=gradient_checkpointing, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + reorder_and_upcast_attn=reorder_and_upcast_attn, + ) + + return ( + config, + input_ids, + attention_mask, + sequence_labels, + token_labels, + choice_labels, + ) + + def get_config( + self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + return FalconMambaConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + 
num_hidden_layers=self.num_hidden_layers, + intermediate_size=self.intermediate_size, + activation_function=self.hidden_act, + n_positions=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + use_cache=True, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + gradient_checkpointing=gradient_checkpointing, + tie_word_embeddings=self.tie_word_embeddings, + ) + + def get_pipeline_config(self): + config = self.get_config() + config.vocab_size = 300 + return config + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + attention_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + return ( + config, + input_ids, + attention_mask, + sequence_labels, + token_labels, + choice_labels, + ) + + def create_and_check_falcon_mamba_model(self, config, input_ids, *args): + config.output_hidden_states = True + model = FalconMambaModel(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(len(result.hidden_states), config.num_hidden_layers + 1) + + def create_and_check_causal_lm(self, config, input_ids, *args): + model = FalconMambaForCausalLM(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_state_equivalency(self, config, input_ids, *args): + model = FalconMambaModel(config=config) + model.to(torch_device) + model.eval() + + outputs = model(input_ids) + output_whole = outputs.last_hidden_state + + outputs = model( + input_ids[:, :-1], + use_cache=True, + cache_position=torch.arange(0, config.conv_kernel, device=input_ids.device), + ) + output_one = outputs.last_hidden_state + + # Using the state computed on the first inputs, we will get the same output + outputs = model( + input_ids[:, -1:], + use_cache=True, + cache_params=outputs.cache_params, + cache_position=torch.arange(config.conv_kernel, config.conv_kernel + 1, device=input_ids.device), + ) + output_two = outputs.last_hidden_state + + self.parent.assertTrue(torch.allclose(torch.cat([output_one, output_two], dim=1), output_whole, atol=1e-5)) + # TODO the original mamba does not support decoding more than 1 token neither do we + + def create_and_check_falcon_mamba_cached_slow_forward_and_backwards( + self, config, input_ids, *args, gradient_checkpointing=False + ): + model = FalconMambaModel(config) + model.to(torch_device) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + + # create cache + cache = model(input_ids, use_cache=True).cache_params + cache.reset() + + # use cache + token_emb = model.embeddings(input_ids) + outputs = model.layers[0].mixer.slow_forward( + token_emb, cache, cache_position=torch.arange(0, config.conv_kernel, device=input_ids.device) + ) + + loss = torch.log1p(torch.abs(outputs.sum())) + self.parent.assertEqual(loss.shape, ()) + self.parent.assertEqual(outputs.shape, (self.batch_size, self.seq_length, self.hidden_size)) + loss.backward() + + def create_and_check_falcon_mamba_lm_head_forward_and_backwards( + self, config, input_ids, *args, gradient_checkpointing=False + ): + model = FalconMambaForCausalLM(config) + model.to(torch_device) + if gradient_checkpointing: + 
model.gradient_checkpointing_enable() + + result = model(input_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + result.loss.backward() + + def prepare_config_and_inputs_for_common(self): + ( + config, + input_ids, + attention_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_torch +# Copied from transformers.tests.models.mamba.MambaModelTest with Mamba->Falcon,mamba->falcon_mamba,FalconMambaCache->MambaCache +class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (FalconMambaModel, FalconMambaForCausalLM) if is_torch_available() else () + has_attentions = False # FalconMamba does not support attentions + fx_compatible = False # FIXME let's try to support this @ArthurZucker + test_torchscript = False # FIXME let's try to support this @ArthurZucker + test_missing_keys = False + test_model_parallel = False + test_pruning = False + test_head_masking = False # FalconMamba does not have attention heads + pipeline_model_mapping = ( + {"feature-extraction": FalconMambaModel, "text-generation": FalconMambaForCausalLM} + if is_torch_available() + else {} + ) + + def setUp(self): + self.model_tester = FalconMambaModelTester(self) + self.config_tester = ConfigTester( + self, config_class=FalconMambaConfig, n_embd=37, common_properties=["hidden_size", "num_hidden_layers"] + ) + + def assertInterval(self, member, container, msg=None): + r""" + Simple utility function to check if a member is inside an interval. + """ + if isinstance(member, torch.Tensor): + max_value, min_value = member.max().item(), member.min().item() + elif isinstance(member, list) or isinstance(member, tuple): + max_value, min_value = max(member), min(member) + + if not isinstance(container, list): + raise TypeError("container should be a list or tuple") + elif len(container) != 2: + raise ValueError("container should have 2 elements") + + expected_min, expected_max = container + + is_inside_interval = (min_value >= expected_min) and (max_value <= expected_max) + + if not is_inside_interval: + standardMsg = f"{safe_repr(member)} not found in {safe_repr(container)}" + self.fail(self._formatMessage(msg, standardMsg)) + + def test_config(self): + self.config_tester.run_common_tests() + + @require_torch_multi_gpu + def test_multi_gpu_data_parallel_forward(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # some params shouldn't be scattered by nn.DataParallel + # so just remove them if they are present. 
+ blacklist_non_batched_params = ["cache_params"] + for k in blacklist_non_batched_params: + inputs_dict.pop(k, None) + + # move input tensors to cuda:0 + for k, v in inputs_dict.items(): + if torch.is_tensor(v): + inputs_dict[k] = v.to(0) + + for model_class in self.all_model_classes: + model = model_class(config=config) + model.to(0) + model.eval() + + # Wrap model in nn.DataParallel + model = torch.nn.DataParallel(model) + with torch.no_grad(): + _ = model(**self._prepare_for_class(inputs_dict, model_class)) + + def test_falcon_mamba_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_falcon_mamba_model(*config_and_inputs) + + def test_falcon_mamba_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_causal_lm(*config_and_inputs) + + def test_state_equivalency(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_state_equivalency(*config_and_inputs) + + def test_falcon_mamba_cached_slow_forward_and_backwards(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_falcon_mamba_cached_slow_forward_and_backwards(*config_and_inputs) + + def test_falcon_mamba_lm_head_forward_and_backwards(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_falcon_mamba_lm_head_forward_and_backwards(*config_and_inputs) + + def test_initialization(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config=config) + for name, param in model.named_parameters(): + if "dt_proj.bias" in name: + dt = torch.exp( + torch.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min)) + + math.log(config.time_step_min) + ).clamp(min=config.time_step_floor) + inv_dt = dt + torch.log(-torch.expm1(-dt)) + if param.requires_grad: + self.assertTrue(param.data.max().item() <= inv_dt[1]) + self.assertTrue(param.data.min().item() >= inv_dt[0]) + elif "A_log" in name: + A = torch.arange(1, config.state_size + 1, dtype=torch.float32)[None, :] + A = A.expand(config.intermediate_size, -1).contiguous() + torch.testing.assert_close(param.data, torch.log(A), rtol=1e-5, atol=1e-5) + elif "D" in name: + if param.requires_grad: + # check if it's a ones like + torch.testing.assert_close(param.data, torch.ones_like(param.data), rtol=1e-5, atol=1e-5) + + @slow + # Ignore copy + def test_model_from_pretrained(self): + model = FalconMambaModel.from_pretrained( + "tiiuae/falcon-mamba-7b", torch_dtype=torch.float16, low_cpu_mem_usage=True + ) + self.assertIsNotNone(model) + + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, MambaCache): # MODIFIED PART START + recursive_check(tuple_object.conv_states, dict_object.conv_states) + recursive_check(tuple_object.ssm_states, dict_object.ssm_states) + elif isinstance(tuple_object, (list, tuple)): # MODIFIED PART END + for tuple_iterable_value, 
dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + torch.allclose(tuple_object, dict_object, atol=1e-5), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:" + f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has" + f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}." + ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + +@require_torch +@require_torch_accelerator +@slow +class FalconMambaIntegrationTests(unittest.TestCase): + def setUp(self): + self.model_id = "tiiuae/falcon-mamba-7b" + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id) + self.text = "Hello today" + + def test_generation_bf16(self): + model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.bfloat16, device_map="auto") + + inputs = self.tokenizer(self.text, return_tensors="pt").to(torch_device) + out = model.generate(**inputs, max_new_tokens=20, do_sample=False) + + self.assertEqual( + self.tokenizer.batch_decode(out, skip_special_tokens=False)[0], + "Hello today I am going to show you how to make a simple and easy to make paper plane.\nStep", + ) + + @require_bitsandbytes + def test_generation_4bit(self): + quantization_config = BitsAndBytesConfig(load_in_4bit=True) + model = AutoModelForCausalLM.from_pretrained(self.model_id, quantization_config=quantization_config) + + inputs = self.tokenizer(self.text, return_tensors="pt").to(torch_device) + out = model.generate(**inputs, max_new_tokens=20, do_sample=False) + + self.assertEqual( + self.tokenizer.batch_decode(out, skip_special_tokens=False)[0], + """Hello today I'm going to talk about the "C" in the "C-I-""", + ) + + def test_generation_torch_compile(self): + model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.bfloat16).to(torch_device) + model = torch.compile(model) + + inputs = self.tokenizer(self.text, return_tensors="pt").to(torch_device) + out = model.generate(**inputs, max_new_tokens=20, do_sample=False) + + self.assertEqual( + self.tokenizer.batch_decode(out, skip_special_tokens=False)[0], + "Hello today I am going to show you 
how to make a simple and easy to make paper plane.\nStep", + ) + + def test_batched_generation(self): + model_id = "tiiuae/falcon-mamba-7b" + tok = AutoTokenizer.from_pretrained(model_id) + tok.pad_token_id = tok.eos_token_id + + texts = ["Hello today", "Hello my name is Younes and today"] + + EXPECTED_OUTPUT = [ + "Hello today I'm going to show you how to make a 3D model of a house.\n", + "Hello my name is Younes and today I will be talking about the topic of “The importance of the internet in our life”.\n", + ] + + inputs = tok(texts, return_tensors="pt", padding=True, return_token_type_ids=False).to(torch_device) + model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0, torch_dtype=torch.bfloat16) + + out = model.generate(**inputs, max_new_tokens=20) + out = tok.batch_decode(out, skip_special_tokens=True) + + self.assertListEqual(out, EXPECTED_OUTPUT) + + # We test the same generations with inputs_embeds + with torch.no_grad(): + inputs_embeds = model.get_input_embeddings()(inputs.pop("input_ids")) + + inputs["inputs_embeds"] = inputs_embeds + out = model.generate(**inputs, max_new_tokens=20) + out = tok.batch_decode(out, skip_special_tokens=True) + + self.assertListEqual(out, EXPECTED_OUTPUT) + + @require_torch_multi_accelerator + def test_training_kernel(self): + model_id = "tiiuae/falcon-mamba-7b" + + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16) + tokenizer.pad_token_id = tokenizer.eos_token_id + + text = "Hello today" + + inputs = tokenizer(text, return_tensors="pt").to(torch_device) + + with torch.no_grad(): + logits = torch.argmax(model(**inputs).logits, dim=-1) + + out_no_training = tokenizer.batch_decode(logits) + + model.train() + lm_logits = model(**inputs).logits + next_token = torch.argmax(lm_logits, dim=-1) + + out_training = tokenizer.batch_decode(next_token) + + # Just verify backward works + loss = (1 - lm_logits).mean() + loss.backward() + + self.assertEqual(out_training, out_no_training) diff --git a/docs/transformers/tests/models/fastspeech2_conformer/__init__.py b/docs/transformers/tests/models/fastspeech2_conformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py b/docs/transformers/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..0e0562da92de7344230959c6319c222c9cbece39 --- /dev/null +++ b/docs/transformers/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py @@ -0,0 +1,806 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch FastSpeech2Conformer model.""" + +import inspect +import tempfile +import unittest + +from transformers import ( + FastSpeech2ConformerConfig, + FastSpeech2ConformerHifiGanConfig, + FastSpeech2ConformerTokenizer, + FastSpeech2ConformerWithHifiGanConfig, + is_torch_available, +) +from transformers.testing_utils import require_g2p_en, require_torch, require_torch_accelerator, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import FastSpeech2ConformerModel, FastSpeech2ConformerWithHifiGan, set_seed + + +class FastSpeech2ConformerModelTester: + def __init__( + self, + parent, + batch_size=13, + num_hidden_layers=1, + num_attention_heads=2, + hidden_size=24, + seq_length=7, + encoder_linear_units=384, + decoder_linear_units=384, + is_training=False, + speech_decoder_postnet_units=128, + speech_decoder_postnet_layers=2, + pitch_predictor_layers=1, + energy_predictor_layers=1, + duration_predictor_layers=1, + num_mel_bins=8, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.vocab_size = hidden_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.encoder_linear_units = encoder_linear_units + self.decoder_linear_units = decoder_linear_units + self.speech_decoder_postnet_units = speech_decoder_postnet_units + self.speech_decoder_postnet_layers = speech_decoder_postnet_layers + self.pitch_predictor_layers = pitch_predictor_layers + self.energy_predictor_layers = energy_predictor_layers + self.duration_predictor_layers = duration_predictor_layers + self.num_mel_bins = num_mel_bins + + def prepare_config_and_inputs(self): + config = self.get_config() + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + return config, input_ids + + def get_config(self): + return FastSpeech2ConformerConfig( + hidden_size=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_linear_units=self.encoder_linear_units, + decoder_linear_units=self.decoder_linear_units, + speech_decoder_postnet_units=self.speech_decoder_postnet_units, + speech_decoder_postnet_layers=self.speech_decoder_postnet_layers, + num_mel_bins=self.num_mel_bins, + pitch_predictor_layers=self.pitch_predictor_layers, + energy_predictor_layers=self.energy_predictor_layers, + duration_predictor_layers=self.duration_predictor_layers, + ) + + def create_and_check_model(self, config, input_ids, *args): + model = FastSpeech2ConformerModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, return_dict=True) + + # total of 5 keys in result + self.parent.assertEqual(len(result), 5) + # check batch sizes match + for value in result.values(): + self.parent.assertEqual(value.size(0), self.batch_size) + # check duration, pitch, and energy have the appropriate shapes + # duration: (batch_size, max_text_length), pitch and energy: (batch_size, max_text_length, 1) + self.parent.assertEqual(result["duration_outputs"].shape + (1,), result["pitch_outputs"].shape) + self.parent.assertEqual(result["pitch_outputs"].shape, result["energy_outputs"].shape) + # check predicted mel-spectrogram has correct dimension + self.parent.assertEqual(result["spectrogram"].size(2), model.config.num_mel_bins) + 
+ def prepare_config_and_inputs_for_common(self): + config, input_ids = self.prepare_config_and_inputs() + inputs_dict = {"input_ids": input_ids} + return config, inputs_dict + + +@require_torch_accelerator +@require_torch +class FastSpeech2ConformerModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (FastSpeech2ConformerModel,) if is_torch_available() else () + test_pruning = False + test_headmasking = False + test_torchscript = False + test_resize_embeddings = False + is_encoder_decoder = True + + def setUp(self): + self.model_tester = FastSpeech2ConformerModelTester(self) + self.config_tester = ConfigTester(self, config_class=FastSpeech2ConformerConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_initialization(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + msg = f"Parameter {name} of model {model_class} seems not properly initialized" + if "norm" in name: + if "bias" in name: + self.assertEqual(param.data.mean().item(), 0.0, msg=msg) + if "weight" in name: + self.assertEqual(param.data.mean().item(), 1.0, msg=msg) + elif "conv" in name or "embed" in name: + self.assertTrue(-1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=msg) + + def test_duration_energy_pitch_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = self.model_tester.seq_length + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + # duration + self.assertListEqual(list(outputs.duration_outputs.shape), [self.model_tester.batch_size, seq_len]) + # energy + self.assertListEqual(list(outputs.energy_outputs.shape), [self.model_tester.batch_size, seq_len, 1]) + # pitch + self.assertListEqual(list(outputs.pitch_outputs.shape), [self.model_tester.batch_size, seq_len, 1]) + + def test_hidden_states_output(self): + def _check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + for idx, hidden_states in enumerate([outputs.encoder_hidden_states, outputs.decoder_hidden_states]): + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + self.assertEqual(len(hidden_states), expected_num_layers) + self.assertIsInstance(hidden_states, (list, tuple)) + expected_batch_size, expected_seq_length, expected_hidden_size = hidden_states[0].shape + self.assertEqual(expected_batch_size, self.model_tester.batch_size) + # Only test encoder seq_length since decoder seq_length is variable based on inputs + if idx == 0: + self.assertEqual(expected_seq_length, self.model_tester.seq_length) + self.assertEqual(expected_hidden_size, self.model_tester.hidden_size) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + inputs_dict["output_hidden_states"] = True + 
_check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerModel) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + _check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerModel) + + def test_save_load_strict(self): + config, _ = self.model_tester.prepare_config_and_inputs() + model = FastSpeech2ConformerModel(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + _, info = FastSpeech2ConformerModel.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + model = FastSpeech2ConformerModel(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = [ + "input_ids", + "attention_mask", + "spectrogram_labels", + "duration_labels", + "pitch_labels", + "energy_labels", + "speaker_ids", + "lang_ids", + "speaker_embedding", + "return_dict", + "output_attentions", + "output_hidden_states", + ] + self.assertListEqual(arg_names, expected_arg_names) + + # Override as FastSpeech2Conformer does not output cross attentions + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + model = FastSpeech2ConformerModel(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, FastSpeech2ConformerModel) + + outputs = model(**inputs) + + output = outputs[0] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_hidden_states.retain_grad() + + decoder_hidden_states = outputs.decoder_hidden_states[0] + decoder_hidden_states.retain_grad() + + encoder_attentions = outputs.encoder_attentions[0] + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(decoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + + def test_attention_outputs(self): + """ + Custom `test_attention_outputs` since FastSpeech2Conformer does not output cross attentions, has variable + decoder attention shape, and uniquely outputs energy, pitch, and durations. 
+ """ + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = self.model_tester.seq_length + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(len(outputs.encoder_attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + encoder_attentions = outputs.encoder_attentions + self.assertEqual(len(encoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(encoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + out_len = len(outputs) + + correct_outlen = 7 + self.assertEqual(out_len, correct_outlen) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + added_hidden_states = 2 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + + @slow + def test_model_from_pretrained(self): + model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer") + self.assertIsNotNone(model) + + @unittest.skip(reason="FastSpeech2Conformer does not accept inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="FastSpeech2Conformer has no input embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip( + "FastSpeech2Conformer predicts durations in linear domain during inference" + "Even small differences on hidden states lead to different durations, due to `torch.round`" + ) + def test_batching_equivalence(self): + pass + + +@require_torch +@require_g2p_en +@slow +class FastSpeech2ConformerModelIntegrationTest(unittest.TestCase): + def test_inference_integration(self): + model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer") + model.to(torch_device) + model.eval() + + tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer") + text = "Test that this generates speech" + input_ids = tokenizer(text, return_tensors="pt").to(torch_device)["input_ids"] + + outputs_dict = model(input_ids) + spectrogram = outputs_dict["spectrogram"] + + # mel-spectrogram is too large (1, 205, 80), so only check top-left 100 elements + # fmt: off + expected_mel_spectrogram = torch.tensor( + [ + [-1.2426, -1.7286, -1.6754, -1.7451, -1.6402, -1.5219, -1.4480, -1.3345, -1.4031, -1.4497], + [-0.7858, -1.4966, -1.3602, -1.4876, -1.2949, -1.0723, -1.0021, -0.7553, -0.6521, -0.6929], + [-0.7298, -1.3908, -1.0369, -1.2656, -1.0342, -0.7883, -0.7420, -0.5249, -0.3734, -0.3977], + [-0.4784, -1.3508, 
-1.1558, -1.4678, -1.2820, -1.0252, -1.0868, -0.9006, -0.8947, -0.8448], + [-0.3963, -1.2895, -1.2813, -1.6147, -1.4658, -1.2560, -1.4134, -1.2650, -1.3255, -1.1715], + [-1.4914, -1.3097, -0.3821, -0.3898, -0.5748, -0.9040, -1.0755, -1.0575, -1.2205, -1.0572], + [0.0197, -0.0582, 0.9147, 1.1512, 1.1651, 0.6628, -0.1010, -0.3085, -0.2285, 0.2650], + [1.1780, 0.1803, 0.7251, 1.5728, 1.6678, 0.4542, -0.1572, -0.1787, 0.0744, 0.8168], + [-0.2078, -0.3211, 1.1096, 1.5085, 1.4632, 0.6299, -0.0515, 0.0589, 0.8609, 1.4429], + [0.7831, -0.2663, 1.0352, 1.4489, 0.9088, 0.0247, -0.3995, 0.0078, 1.2446, 1.6998], + ], + device=torch_device, + ) + # fmt: on + + torch.testing.assert_close(spectrogram[0, :10, :10], expected_mel_spectrogram, rtol=1e-4, atol=1e-4) + self.assertEqual(spectrogram.shape, (1, 205, model.config.num_mel_bins)) + + def test_training_integration(self): + model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer") + model.to(torch_device) + # Set self.training manually to keep deterministic but run the training path + model.training = True + set_seed(0) + + tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer") + text = "Test that this generates speech" + input_ids = tokenizer(text, return_tensors="pt").to(torch_device)["input_ids"] + + # NOTE: Dummy numbers since FastSpeech2Conformer does not have a feature extractor due to the package deps required (librosa, MFA) + batch_size, max_text_len = input_ids.shape + pitch_labels = torch.rand((batch_size, max_text_len, 1), dtype=torch.float, device=torch_device) + energy_labels = torch.rand((batch_size, max_text_len, 1), dtype=torch.float, device=torch_device) + duration_labels = torch.normal(10, 2, size=(batch_size, max_text_len)).clamp(1, 20).int() + max_target_len, _ = duration_labels.sum(dim=1).max(dim=0) + max_target_len = max_target_len.item() + spectrogram_labels = torch.rand( + (batch_size, max_target_len, model.num_mel_bins), dtype=torch.float, device=torch_device + ) + + outputs_dict = model( + input_ids, + spectrogram_labels=spectrogram_labels, + duration_labels=duration_labels, + pitch_labels=pitch_labels, + energy_labels=energy_labels, + return_dict=True, + ) + spectrogram = outputs_dict["spectrogram"] + loss = outputs_dict["loss"] + + # # mel-spectrogram is too large (1, 224, 80), so only check top-left 100 elements + # fmt: off + expected_mel_spectrogram = torch.tensor( + [ + [-1.0643e+00, -6.8058e-01, -1.0901e+00, -8.2724e-01, -7.7241e-01, -1.1905e+00, -8.5725e-01, -8.2930e-01, -1.1313e+00, -1.2449e+00], + [-5.5067e-01, -2.7045e-01, -6.3483e-01, -1.9320e-01, 1.0234e-01, -3.3253e-01, -2.4423e-01, -3.5045e-01, -5.2070e-01, -4.3710e-01], + [ 2.2181e-01, 3.1433e-01, -1.2849e-01, 6.0253e-01, 1.0033e+00, 1.3952e-01, 1.2851e-01, -2.3063e-02, -1.5092e-01, 2.4903e-01], + [ 4.6343e-01, 4.1820e-01, 1.6468e-01, 1.1297e+00, 1.4588e+00, 1.3737e-01, 6.6355e-02, -6.0973e-02, -5.4225e-02, 5.9208e-01], + [ 5.2762e-01, 4.8725e-01, 4.2735e-01, 1.4392e+00, 1.7398e+00, 2.4891e-01, -8.4531e-03, -8.1282e-02, 1.2857e-01, 8.7559e-01], + [ 5.2548e-01, 5.1653e-01, 5.2034e-01, 1.3782e+00, 1.5972e+00, 1.6380e-01, -5.1807e-02, 1.5474e-03, 2.2824e-01, 8.5288e-01], + [ 3.6356e-01, 4.4109e-01, 4.4257e-01, 9.4273e-01, 1.1201e+00, -9.0551e-03, -1.1627e-01, -2.0821e-02, 1.0793e-01, 5.0336e-01], + [ 3.6598e-01, 3.2708e-01, 1.3297e-01, 4.5162e-01, 6.4168e-01, -2.6923e-01, -2.3101e-01, -1.4943e-01, -1.4732e-01, 7.3057e-02], + [ 2.7639e-01, 2.2588e-01, -1.5310e-01, 1.0957e-01, 3.3048e-01, -5.3431e-01, 
-3.3822e-01, -2.8007e-01, -3.3823e-01, -1.5775e-01], + [ 2.9323e-01, 1.6723e-01, -3.4153e-01, -1.1209e-01, 1.7355e-01, -6.1724e-01, -5.4201e-01, -4.9944e-01, -5.2212e-01, -2.7596e-01] + ], + device=torch_device, + ) + # fmt: on + + expected_loss = torch.tensor(74.4595, device=torch_device) + + torch.testing.assert_close(spectrogram[0, :10, :10], expected_mel_spectrogram, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(loss, expected_loss, rtol=1e-4, atol=1e-4) + self.assertEqual(spectrogram.shape, (1, 224, model.config.num_mel_bins)) + + +class FastSpeech2ConformerWithHifiGanTester: + def __init__( + self, + parent, + batch_size=13, + num_hidden_layers=1, + num_attention_heads=2, + hidden_size=24, + seq_length=7, + encoder_linear_units=384, + decoder_linear_units=384, + is_training=False, + speech_decoder_postnet_units=128, + speech_decoder_postnet_layers=2, + pitch_predictor_layers=1, + energy_predictor_layers=1, + duration_predictor_layers=1, + num_mel_bins=8, + upsample_initial_channel=64, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.vocab_size = hidden_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.encoder_linear_units = encoder_linear_units + self.decoder_linear_units = decoder_linear_units + self.speech_decoder_postnet_units = speech_decoder_postnet_units + self.speech_decoder_postnet_layers = speech_decoder_postnet_layers + self.pitch_predictor_layers = pitch_predictor_layers + self.energy_predictor_layers = energy_predictor_layers + self.duration_predictor_layers = duration_predictor_layers + self.num_mel_bins = num_mel_bins + self.upsample_initial_channel = upsample_initial_channel + + def prepare_config_and_inputs(self): + config = self.get_config() + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + return config, input_ids + + def get_config(self): + self.model_config = FastSpeech2ConformerConfig( + hidden_size=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_linear_units=self.encoder_linear_units, + decoder_linear_units=self.decoder_linear_units, + speech_decoder_postnet_units=self.speech_decoder_postnet_units, + speech_decoder_postnet_layers=self.speech_decoder_postnet_layers, + num_mel_bins=self.num_mel_bins, + pitch_predictor_layers=self.pitch_predictor_layers, + energy_predictor_layers=self.energy_predictor_layers, + duration_predictor_layers=self.duration_predictor_layers, + ) + self.vocoder_config = FastSpeech2ConformerHifiGanConfig( + model_in_dim=self.num_mel_bins, upsample_initial_channel=self.upsample_initial_channel + ) + return FastSpeech2ConformerWithHifiGanConfig( + model_config=self.model_config.to_dict(), vocoder_config=self.vocoder_config.to_dict() + ) + + def create_and_check_model(self, config, input_ids, *args): + model = FastSpeech2ConformerWithHifiGan(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, return_dict=True) + + # total of 6 keys in result + self.parent.assertEqual(len(result), 6) + # check batch sizes match + for value in result.values(): + self.parent.assertEqual(value.size(0), self.batch_size) + # check duration, pitch, and energy have the appropriate shapes + # duration: (batch_size, max_text_length), pitch and energy: (batch_size, max_text_length, 1) + self.parent.assertEqual(result["duration_outputs"].shape + (1,),
result["pitch_outputs"].shape) + self.parent.assertEqual(result["pitch_outputs"].shape, result["energy_outputs"].shape) + # check predicted mel-spectrogram has correct dimension + self.parent.assertEqual(result["spectrogram"].size(2), model.config.model_config.num_mel_bins) + + def prepare_config_and_inputs_for_common(self): + config, input_ids = self.prepare_config_and_inputs() + inputs_dict = {"input_ids": input_ids} + return config, inputs_dict + + +@require_torch_accelerator +@require_torch +class FastSpeech2ConformerWithHifiGanTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (FastSpeech2ConformerWithHifiGan,) if is_torch_available() else () + test_pruning = False + test_headmasking = False + test_torchscript = False + test_resize_embeddings = False + is_encoder_decoder = True + + def setUp(self): + self.model_tester = FastSpeech2ConformerWithHifiGanTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_initialization(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + msg = f"Parameter {name} of model {model_class} seems not properly initialized" + if "norm" in name: + if "bias" in name: + self.assertEqual(param.data.mean().item(), 0.0, msg=msg) + if "weight" in name: + self.assertEqual(param.data.mean().item(), 1.0, msg=msg) + elif "conv" in name or "embed" in name: + self.assertTrue(-1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=msg) + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + return inputs_dict + + def test_duration_energy_pitch_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.model_config.return_dict = True + + seq_len = self.model_tester.seq_length + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + # duration + self.assertListEqual(list(outputs.duration_outputs.shape), [self.model_tester.batch_size, seq_len]) + # energy + self.assertListEqual(list(outputs.energy_outputs.shape), [self.model_tester.batch_size, seq_len, 1]) + # pitch + self.assertListEqual(list(outputs.pitch_outputs.shape), [self.model_tester.batch_size, seq_len, 1]) + + def test_hidden_states_output(self): + def _check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + for idx, hidden_states in enumerate([outputs.encoder_hidden_states, outputs.decoder_hidden_states]): + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + self.assertEqual(len(hidden_states), expected_num_layers) + self.assertIsInstance(hidden_states, (list, tuple)) + expected_batch_size, expected_seq_length, expected_hidden_size = hidden_states[0].shape + self.assertEqual(expected_batch_size, self.model_tester.batch_size) + # Only test encoder seq_length since decoder seq_length is variable based on inputs + if idx == 0: + 
self.assertEqual(expected_seq_length, self.model_tester.seq_length) + self.assertEqual(expected_hidden_size, self.model_tester.hidden_size) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + inputs_dict["output_hidden_states"] = True + _check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerWithHifiGan) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.model_config.output_hidden_states = True + + _check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerWithHifiGan) + + def test_save_load_strict(self): + config, _ = self.model_tester.prepare_config_and_inputs() + model = FastSpeech2ConformerWithHifiGan(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + _, info = FastSpeech2ConformerWithHifiGan.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + model = FastSpeech2ConformerWithHifiGan(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = [ + "input_ids", + "attention_mask", + "spectrogram_labels", + "duration_labels", + "pitch_labels", + "energy_labels", + "speaker_ids", + "lang_ids", + "speaker_embedding", + "return_dict", + "output_attentions", + "output_hidden_states", + ] + self.assertListEqual(arg_names, expected_arg_names) + + # Override as FastSpeech2Conformer does not output cross attentions + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.model_config.output_hidden_states = True + config.model_config.output_attentions = True + + model = FastSpeech2ConformerWithHifiGan(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, FastSpeech2ConformerModel) + + outputs = model(**inputs) + + output = outputs[0] + + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_hidden_states.retain_grad() + + decoder_hidden_states = outputs.decoder_hidden_states[0] + decoder_hidden_states.retain_grad() + + encoder_attentions = outputs.encoder_attentions[0] + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(decoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + + def test_attention_outputs(self): + """ + Custom `test_attention_outputs` since FastSpeech2Conformer does not output cross attentions, has variable + decoder attention shape, and uniquely outputs energy, pitch, and durations. 
+ """ + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.model_config.return_dict = True + + seq_len = self.model_tester.seq_length + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.model_config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(len(outputs.encoder_attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.model_config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + encoder_attentions = outputs.encoder_attentions + self.assertEqual(len(encoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(encoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + out_len = len(outputs) + + correct_outlen = 8 + self.assertEqual(out_len, correct_outlen) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + added_hidden_states = 2 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + + @slow + def test_model_from_pretrained(self): + model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer") + self.assertIsNotNone(model) + + @unittest.skip(reason="FastSpeech2Conformer does not accept inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="FastSpeech2Conformer has no input embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip( + "FastSpeech2Conformer predicts durations in linear domain during inference" + "Even small differences on hidden states lead to different durations, due to `torch.round`" + ) + def test_batching_equivalence(self): + pass + + +@require_torch +@require_g2p_en +@slow +class FastSpeech2ConformerWithHifiGanIntegrationTest(unittest.TestCase): + def test_inference_integration(self): + model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan") + model.to(torch_device) + model.eval() + + tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer") + text = "Test that this generates speech" + input_ids = tokenizer(text, return_tensors="pt").to(torch_device)["input_ids"] + + output = model(input_ids) + waveform = output.waveform + + # waveform is too large (1, 52480), so only check first 100 elements + # fmt: off + expected_waveform = torch.tensor( + [ + [-9.6345e-04, 1.3557e-03, 5.7559e-04, 2.4706e-04, 2.2675e-04, 1.2258e-04, 4.7784e-04, 1.0109e-03, -1.9718e-04, 6.3495e-04, 3.2106e-04, 6.3620e-05, 9.1713e-04, -2.5664e-05, 1.9596e-04, 6.0418e-04, 8.1112e-04, 3.6342e-04, -6.3396e-04, -2.0146e-04, -1.1768e-04, 4.3155e-04, 
7.5599e-04, -2.2972e-04, -9.5665e-05, 3.3078e-04, 1.3793e-04, -1.4932e-04, -3.9645e-04, 3.6473e-05, -1.7224e-04, -4.5370e-05, -4.8950e-04, -4.3059e-04, 1.0451e-04, -1.0485e-03, -6.0410e-04, 1.6990e-04, -2.1997e-04, -3.8769e-04, -7.6898e-04, -3.2372e-04, -1.9783e-04, 5.2896e-05, -1.0586e-03, -7.8516e-04, 7.6867e-04, -8.5331e-05, -4.8158e-04, -4.5362e-05, -1.0770e-04, 6.6823e-04, 3.0765e-04, 3.3669e-04, 9.5677e-04, 1.0458e-03, 5.8129e-04, 3.3737e-04, 1.0816e-03, 7.0346e-04, 4.2378e-04, 4.3131e-04, 2.8095e-04, 1.2201e-03, 5.6121e-04, -1.1086e-04, 4.9908e-04, 1.5586e-04, 4.2046e-04, -2.8088e-04, -2.2462e-04, -1.5539e-04, -7.0126e-04, -2.8577e-04, -3.3693e-04, -1.2471e-04, -6.9104e-04, -1.2867e-03, -6.2651e-04, -2.5586e-04, -1.3201e-04, -9.4537e-04, -4.8438e-04, 4.1458e-04, 6.4109e-04, 1.0891e-04, -6.3764e-04, 4.5573e-04, 8.2974e-04, 3.2973e-06, -3.8274e-04, -2.0400e-04, 4.9922e-04, 2.1508e-04, -1.1009e-04, -3.9763e-05, 3.0576e-04, 3.1485e-05, -2.7574e-05, 3.3856e-04], + ], + device=torch_device, + ) + # fmt: on + + torch.testing.assert_close(waveform[0, :100], expected_waveform, rtol=1e-4, atol=1e-4) + self.assertEqual(waveform.shape, (1, 52480)) diff --git a/docs/transformers/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py b/docs/transformers/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..d42a774d7c8b1040ec99633164a14c35d3e23858 --- /dev/null +++ b/docs/transformers/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py @@ -0,0 +1,191 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tests for the FastSpeech2Conformer tokenizer.""" + +import unittest + +from transformers.models.fastspeech2_conformer import FastSpeech2ConformerTokenizer +from transformers.testing_utils import require_g2p_en, slow + +from ...test_tokenization_common import TokenizerTesterMixin + + +@require_g2p_en +class FastSpeech2ConformerTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "espnet/fastspeech2_conformer" + tokenizer_class = FastSpeech2ConformerTokenizer + test_rust_tokenizer = False + + @classmethod + def setUpClass(cls): + super().setUpClass() + tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer") + tokenizer.save_pretrained(cls.tmpdirname) + + def get_input_output_texts(self, tokenizer): + input_text = "this is a test" + output_text = "this is a test" + return input_text, output_text + + # Custom `get_clean_sequence` since FastSpeech2ConformerTokenizer can't decode id -> string + def get_clean_sequence(self, tokenizer, with_prefix_space=False, **kwargs): # max_length=20, min_length=5 + input_text, output_text = self.get_input_output_texts(tokenizer) + ids = tokenizer.encode(output_text, add_special_tokens=False) + return output_text, ids + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "" + token_id = 1 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "") + self.assertEqual(vocab_keys[1], "") + self.assertEqual(vocab_keys[-4], "UH0") + self.assertEqual(vocab_keys[-2], "..") + self.assertEqual(vocab_keys[-1], "") + self.assertEqual(len(vocab_keys), 78) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 78) + + @unittest.skip( + "FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend" + ) + def test_added_token_are_matched_longest_first(self): + pass + + @unittest.skip( + "FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend" + ) + def test_added_tokens_do_lower_case(self): + pass + + @unittest.skip( + "FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend" + ) + def test_tokenize_special_tokens(self): + pass + + def test_full_tokenizer(self): + tokenizer = self.get_tokenizer() + + tokens = tokenizer.tokenize("This is a test") + ids = [9, 12, 6, 12, 11, 2, 4, 15, 6, 4, 77] + self.assertListEqual(tokens, ["DH", "IH1", "S", "IH1", "Z", "AH0", "T", "EH1", "S", "T", ""]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), ids) + self.assertListEqual(tokenizer.convert_ids_to_tokens(ids), tokens) + + @slow + def test_tokenizer_integration(self): + # Custom test since: + # 1) This tokenizer only decodes to tokens (phonemes cannot be converted to text with complete accuracy) + # 2) Uses a sequence without numbers since espnet has different, custom number conversion. + # This tokenizer can phonemize numbers, but where in espnet "32" is phonemized as "thirty two", + # here "32" is phonemized as "thirty-two" because we haven't implemented the custom number handling. 
+ + sequences = [ + "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides " + "general-purpose architectures (BERT, GPT, RoBERTa, XLM, DistilBert, XLNet...) for Natural " + "Language Understanding (NLU) and Natural Language Generation (NLG) with over thirty-two pretrained " + "models in one hundred plus languages and deep interoperability between Jax, PyTorch and TensorFlow.", + "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly " + "conditioning on both left and right context in all layers.", + "The quick brown fox jumps over the lazy dog.", + ] + tokenizer = FastSpeech2ConformerTokenizer.from_pretrained( + "espnet/fastspeech2_conformer", revision="07f9c4a2d6bbc69b277d87d2202ad1e35b05e113" + ) + actual_encoding = tokenizer(sequences) + + # fmt: off + expected_encoding = { + 'input_ids': [ + [4, 7, 60, 3, 6, 22, 30, 7, 14, 21, 11, 22, 30, 7, 14, 21, 8, 29, 3, 34, 3, 18, 11, 17, 12, 4, 21, 10, 4, 7, 60, 3, 6, 22, 30, 7, 14, 21, 11, 2, 3, 5, 17, 12, 4, 21, 10, 17, 7, 29, 4, 7, 31, 3, 5, 25, 38, 4, 17, 7, 2, 20, 32, 5, 11, 40, 15, 3, 21, 2, 8, 17, 38, 17, 2, 6, 24, 7, 10, 2, 4, 45, 10, 39, 21, 11, 25, 38, 4, 23, 37, 15, 4, 6, 23, 7, 2, 25, 38, 4, 2, 23, 11, 8, 15, 14, 11, 23, 5, 13, 6, 4, 12, 8, 4, 21, 25, 23, 11, 8, 15, 3, 39, 2, 8, 1, 22, 30, 7, 3, 18, 39, 21, 2, 8, 8, 18, 36, 37, 16, 2, 40, 62, 3, 5, 21, 6, 4, 18, 3, 5, 13, 36, 3, 8, 28, 2, 3, 5, 3, 18, 39, 21, 2, 8, 8, 18, 36, 37, 16, 2, 40, 40, 45, 3, 21, 31, 35, 2, 3, 15, 8, 36, 16, 12, 9, 34, 20, 21, 43, 38, 5, 29, 4, 28, 17, 7, 29, 4, 7, 31, 3, 5, 14, 24, 5, 2, 8, 11, 13, 3, 16, 19, 3, 26, 19, 3, 5, 7, 2, 5, 17, 8, 19, 6, 8, 18, 36, 37, 16, 2, 40, 2, 11, 2, 3, 5, 5, 27, 17, 49, 3, 4, 21, 2, 17, 21, 25, 12, 8, 2, 4, 29, 25, 13, 4, 16, 27, 3, 40, 18, 10, 6, 23, 17, 12, 4, 21, 10, 2, 3, 5, 4, 15, 3, 6, 21, 8, 46, 22, 33, 77], + [25, 38, 4, 12, 11, 5, 13, 11, 32, 3, 5, 4, 28, 17, 7, 27, 4, 7, 31, 3, 5, 27, 17, 25, 51, 5, 13, 7, 15, 10, 35, 2, 3, 2, 8, 7, 45, 17, 7, 2, 11, 2, 3, 4, 31, 35, 2, 3, 11, 22, 7, 19, 14, 2, 3, 8, 31, 25, 2, 8, 5, 4, 15, 10, 6, 4, 25, 32, 40, 55, 3, 4, 8, 29, 10, 2, 3, 5, 12, 35, 2, 3, 13, 36, 24, 3, 25, 34, 43, 8, 15, 22, 4, 2, 3, 5, 7, 32, 4, 10, 24, 3, 4, 54, 10, 6, 4, 13, 3, 30, 8, 8, 31, 21, 11, 33, 77], + [9, 2, 10, 16, 12, 10, 25, 7, 42, 3, 22, 24, 10, 6, 40, 19, 14, 17, 6, 34, 20, 21, 9, 2, 8, 31, 11, 29, 5, 30, 37, 33, 77] + ], + 'attention_mask': [ + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + ] + } + # fmt: on + + actual_tokens = [tokenizer.decode(input_ids) for input_ids in expected_encoding["input_ids"]] + expected_tokens = [ + [tokenizer.convert_ids_to_tokens(id) for id in sequence] for sequence in expected_encoding["input_ids"] + ] + + self.assertListEqual(actual_encoding["input_ids"], expected_encoding["input_ids"]) + self.assertListEqual(actual_encoding["attention_mask"], expected_encoding["attention_mask"]) + self.assertTrue(actual_tokens == expected_tokens) + + @unittest.skip( + reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend" + ) + def test_add_tokens_tokenizer(self): + pass + + @unittest.skip( + reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend" + ) + def test_add_special_tokens(self): + pass + + @unittest.skip( + reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend" + ) + def test_added_token_serializable(self): + pass + + @unittest.skip( + reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend" + ) + def test_save_and_load_tokenizer(self): + pass + + @unittest.skip(reason="Phonemes cannot be reliably converted to string due to one-many mapping") + def test_internal_consistency(self): + pass + + @unittest.skip(reason="Phonemes cannot be reliably converted to string due to one-many mapping") + def test_encode_decode_with_spaces(self): + pass + + @unittest.skip(reason="Phonemes cannot be reliably converted to string due to one-many mapping") + def test_convert_tokens_to_string_format(self): + pass + + @unittest.skip(reason="FastSpeech2Conformer tokenizer does not support pairs.") + def test_maximum_encoding_length_pair_input(self): + pass + + @unittest.skip( + "FastSpeech2Conformer tokenizer appends eos_token to each string it's passed, including `is_split_into_words=True`." + ) + def test_pretokenized_inputs(self): + pass + + @unittest.skip( + reason="g2p_en is slow is with large inputs and max encoding length is not a concern for FastSpeech2Conformer" + ) + def test_maximum_encoding_length_single_input(self): + pass diff --git a/docs/transformers/tests/models/flaubert/__init__.py b/docs/transformers/tests/models/flaubert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/flaubert/test_modeling_flaubert.py b/docs/transformers/tests/models/flaubert/test_modeling_flaubert.py new file mode 100644 index 0000000000000000000000000000000000000000..f98773a1199b7468cea38931f2a4924e47543637 --- /dev/null +++ b/docs/transformers/tests/models/flaubert/test_modeling_flaubert.py @@ -0,0 +1,518 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import os +import tempfile +import unittest + +from transformers import FlaubertConfig, is_sacremoses_available, is_torch_available +from transformers.testing_utils import require_torch, require_torch_accelerator, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + FlaubertForMultipleChoice, + FlaubertForQuestionAnswering, + FlaubertForQuestionAnsweringSimple, + FlaubertForSequenceClassification, + FlaubertForTokenClassification, + FlaubertModel, + FlaubertWithLMHeadModel, + ) + from transformers.models.flaubert.modeling_flaubert import create_sinusoidal_embeddings + + +class FlaubertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_lengths=True, + use_token_type_ids=True, + use_labels=True, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=2, + vocab_size=99, + n_special=0, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=12, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + summary_type="last", + use_proj=None, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_lengths = use_input_lengths + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.gelu_activation = gelu_activation + self.sinusoidal_embeddings = sinusoidal_embeddings + self.causal = causal + self.asm = asm + self.n_langs = n_langs + self.vocab_size = vocab_size + self.n_special = n_special + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.summary_type = summary_type + self.use_proj = use_proj + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + input_lengths = None + if self.use_input_lengths: + input_lengths = ( + ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 + ) # small variation of seq_length + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs) + + sequence_labels = None + token_labels = None + is_impossible_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + is_impossible_labels = ids_tensor([self.batch_size], 2).float() + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = 
self.get_config() + + return ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ) + + def get_config(self): + return FlaubertConfig( + vocab_size=self.vocab_size, + n_special=self.n_special, + emb_dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + gelu_activation=self.gelu_activation, + sinusoidal_embeddings=self.sinusoidal_embeddings, + asm=self.asm, + causal=self.causal, + n_langs=self.n_langs, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + summary_type=self.summary_type, + use_proj=self.use_proj, + ) + + def create_and_check_flaubert_model( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = FlaubertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, lengths=input_lengths, langs=token_type_ids) + result = model(input_ids, langs=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_flaubert_lm_head( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = FlaubertWithLMHeadModel(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_flaubert_simple_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = FlaubertForQuestionAnsweringSimple(config) + model.to(torch_device) + model.eval() + + result = model(input_ids) + + result = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_flaubert_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = FlaubertForQuestionAnswering(config) + model.to(torch_device) + model.eval() + + result = model(input_ids) + + result_with_labels = model( + input_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + p_mask=input_mask, + ) + + result_with_labels = model( + input_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + ) + + (total_loss,) = result_with_labels.to_tuple() + + result_with_labels = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) + + (total_loss,) = result_with_labels.to_tuple() + + self.parent.assertEqual(result_with_labels.loss.shape, ()) + self.parent.assertEqual(result.start_top_log_probs.shape, (self.batch_size, model.config.start_n_top)) + 
self.parent.assertEqual(result.start_top_index.shape, (self.batch_size, model.config.start_n_top)) + self.parent.assertEqual( + result.end_top_log_probs.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top) + ) + self.parent.assertEqual( + result.end_top_index.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top) + ) + self.parent.assertEqual(result.cls_logits.shape, (self.batch_size,)) + + def create_and_check_flaubert_sequence_classif( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = FlaubertForSequenceClassification(config) + model.to(torch_device) + model.eval() + + result = model(input_ids) + result = model(input_ids, labels=sequence_labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def create_and_check_flaubert_token_classif( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + config.num_labels = self.num_labels + model = FlaubertForTokenClassification(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_flaubert_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + config.num_choices = self.num_choices + model = FlaubertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "lengths": input_lengths, + "attention_mask": input_mask, + } + return config, inputs_dict + + +@require_torch +class FlaubertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + FlaubertModel, + FlaubertWithLMHeadModel, + FlaubertForQuestionAnswering, + FlaubertForQuestionAnsweringSimple, + FlaubertForSequenceClassification, + FlaubertForTokenClassification, + FlaubertForMultipleChoice, + ) + if is_torch_available() + else () + ) + # Doesn't run generation tests. 
Outdated custom `prepare_inputs_for_generation` -- TODO @gante + all_generative_model_classes = () + pipeline_model_mapping = ( + { + "feature-extraction": FlaubertModel, + "fill-mask": FlaubertWithLMHeadModel, + "question-answering": FlaubertForQuestionAnsweringSimple, + "text-classification": FlaubertForSequenceClassification, + "token-classification": FlaubertForTokenClassification, + "zero-shot": FlaubertForSequenceClassification, + } + if is_torch_available() and is_sacremoses_available() + else {} + ) + + # TODO: Fix the failed tests + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + if ( + pipeline_test_case_name == "QAPipelineTests" + and tokenizer_name is not None + and not tokenizer_name.endswith("Fast") + ): + # `QAPipelineTests` fails for a few models when the slower tokenizer are used. + # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework) + # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer + return True + + return False + + # Flaubert has 2 QA models -> need to manually set the correct labels for one of them here + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "FlaubertForQuestionAnswering": + inputs_dict["start_positions"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + inputs_dict["end_positions"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + + return inputs_dict + + def setUp(self): + self.model_tester = FlaubertModelTester(self) + self.config_tester = ConfigTester(self, config_class=FlaubertConfig, emb_dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_flaubert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_model(*config_and_inputs) + + # Copied from tests/models/distilbert/test_modeling_distilbert.py with Distilbert->Flaubert + def test_flaubert_model_with_sinusoidal_encodings(self): + config = FlaubertConfig(sinusoidal_embeddings=True) + model = FlaubertModel(config=config) + sinusoidal_pos_embds = torch.empty((config.max_position_embeddings, config.emb_dim), dtype=torch.float32) + create_sinusoidal_embeddings(config.max_position_embeddings, config.emb_dim, sinusoidal_pos_embds) + self.model_tester.parent.assertTrue(torch.equal(model.position_embeddings.weight, sinusoidal_pos_embds)) + + def test_flaubert_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_lm_head(*config_and_inputs) + + def test_flaubert_simple_qa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_simple_qa(*config_and_inputs) + + def test_flaubert_qa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_qa(*config_and_inputs) + + def test_flaubert_sequence_classif(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_sequence_classif(*config_and_inputs) + + def test_flaubert_token_classif(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_token_classif(*config_and_inputs) + + def test_flaubert_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_multiple_choice(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "flaubert/flaubert_small_cased" + model = FlaubertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + @require_torch_accelerator + def test_torchscript_device_change(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + # FlauBertForMultipleChoice behaves incorrectly in JIT environments. + if model_class == FlaubertForMultipleChoice: + self.skipTest(reason="FlauBertForMultipleChoice behaves incorrectly in JIT environments.") + + config.torchscript = True + model = model_class(config=config) + + inputs_dict = self._prepare_for_class(inputs_dict, model_class) + traced_model = torch.jit.trace( + model, (inputs_dict["input_ids"].to("cpu"), inputs_dict["attention_mask"].to("cpu")) + ) + + with tempfile.TemporaryDirectory() as tmp: + torch.jit.save(traced_model, os.path.join(tmp, "traced_model.pt")) + loaded = torch.jit.load(os.path.join(tmp, "traced_model.pt"), map_location=torch_device) + loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device)) + + +@require_torch +class FlaubertModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_no_head_absolute_embedding(self): + model = FlaubertModel.from_pretrained("flaubert/flaubert_base_cased") + input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 768)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[[-2.6251, -1.4298, -0.0227], [-2.8510, -1.6387, 0.2258], [-2.8114, -1.1832, -0.3066]]] + ) + + torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/flaubert/test_modeling_tf_flaubert.py b/docs/transformers/tests/models/flaubert/test_modeling_tf_flaubert.py new file mode 100644 index 0000000000000000000000000000000000000000..1a2931c398fc8ad49e0747cfb29ef0009410662b --- /dev/null +++ b/docs/transformers/tests/models/flaubert/test_modeling_tf_flaubert.py @@ -0,0 +1,398 @@ +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest + +from transformers import is_tf_available +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import numpy as np + import tensorflow as tf + + from transformers import ( + FlaubertConfig, + TFFlaubertForMultipleChoice, + TFFlaubertForQuestionAnsweringSimple, + TFFlaubertForSequenceClassification, + TFFlaubertForTokenClassification, + TFFlaubertModel, + TFFlaubertWithLMHeadModel, + ) + + +class TFFlaubertModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_lengths = True + self.use_token_type_ids = True + self.use_labels = True + self.gelu_activation = True + self.sinusoidal_embeddings = False + self.causal = False + self.asm = False + self.n_langs = 2 + self.vocab_size = 99 + self.n_special = 0 + self.hidden_size = 32 + self.num_hidden_layers = 2 + self.num_attention_heads = 4 + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.summary_type = "last" + self.use_proj = True + self.scope = None + self.bos_token_id = 0 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_mask = random_attention_mask([self.batch_size, self.seq_length], dtype=tf.float32) + + input_lengths = None + if self.use_input_lengths: + input_lengths = ( + ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 + ) # small variation of seq_length + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs) + + sequence_labels = None + token_labels = None + is_impossible_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = FlaubertConfig( + vocab_size=self.vocab_size, + n_special=self.n_special, + emb_dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + gelu_activation=self.gelu_activation, + sinusoidal_embeddings=self.sinusoidal_embeddings, + asm=self.asm, + causal=self.causal, + n_langs=self.n_langs, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + summary_type=self.summary_type, + use_proj=self.use_proj, + bos_token_id=self.bos_token_id, + ) + + return ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ) + + def create_and_check_flaubert_model( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = 
TFFlaubertModel(config=config) + inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_flaubert_lm_head( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = TFFlaubertWithLMHeadModel(config) + + inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} + result = model(inputs) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_flaubert_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = TFFlaubertForQuestionAnsweringSimple(config) + + inputs = {"input_ids": input_ids, "lengths": input_lengths} + + result = model(inputs) + + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_flaubert_sequence_classif( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + model = TFFlaubertForSequenceClassification(config) + + inputs = {"input_ids": input_ids, "lengths": input_lengths} + + result = model(inputs) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def create_and_check_flaubert_for_token_classification( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + config.num_labels = self.num_labels + model = TFFlaubertForTokenClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_flaubert_for_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + config.num_choices = self.num_choices + model = TFFlaubertForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "langs": 
token_type_ids, + "lengths": input_lengths, + } + return config, inputs_dict + + +@require_tf +class TFFlaubertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + TFFlaubertModel, + TFFlaubertWithLMHeadModel, + TFFlaubertForSequenceClassification, + TFFlaubertForQuestionAnsweringSimple, + TFFlaubertForTokenClassification, + TFFlaubertForMultipleChoice, + ) + if is_tf_available() + else () + ) + all_generative_model_classes = ( + (TFFlaubertWithLMHeadModel,) if is_tf_available() else () + ) # TODO (PVP): Check other models whether language generation is also applicable + pipeline_model_mapping = ( + { + "feature-extraction": TFFlaubertModel, + "fill-mask": TFFlaubertWithLMHeadModel, + "question-answering": TFFlaubertForQuestionAnsweringSimple, + "text-classification": TFFlaubertForSequenceClassification, + "token-classification": TFFlaubertForTokenClassification, + "zero-shot": TFFlaubertForSequenceClassification, + } + if is_tf_available() + else {} + ) + test_head_masking = False + test_onnx = False + + # TODO: Fix the failed tests + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + if ( + pipeline_test_case_name == "QAPipelineTests" + and tokenizer_name is not None + and not tokenizer_name.endswith("Fast") + ): + # `QAPipelineTests` fails for a few models when the slower tokenizer are used. + # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework) + # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer + return True + + return False + + def setUp(self): + self.model_tester = TFFlaubertModelTester(self) + self.config_tester = ConfigTester(self, config_class=FlaubertConfig, emb_dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_flaubert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_model(*config_and_inputs) + + def test_flaubert_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_lm_head(*config_and_inputs) + + def test_flaubert_qa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_qa(*config_and_inputs) + + def test_flaubert_sequence_classif(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_sequence_classif(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_for_token_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_flaubert_for_multiple_choice(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "hf-internal-testing/tiny-random-flaubert" + model = TFFlaubertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_tf +@require_sentencepiece +@require_tokenizers +class TFFlaubertModelIntegrationTest(unittest.TestCase): + @slow + def test_output_embeds_base_model(self): + model = TFFlaubertModel.from_pretrained("jplu/tf-flaubert-small-cased") + + input_ids = tf.convert_to_tensor( + [[0, 158, 735, 2592, 
1424, 6727, 82, 1]], + dtype=tf.int32, + ) # "J'aime flaubert !" + + output = model(input_ids)[0] + expected_shape = tf.TensorShape((1, 8, 512)) + self.assertEqual(output.shape, expected_shape) + # compare the actual values for a slice. + expected_slice = tf.convert_to_tensor( + [ + [ + [-1.8768773, -1.566555, 0.27072418], + [-1.6920038, -0.5873505, 1.9329599], + [-2.9563985, -1.6993835, 1.7972052], + ] + ], + dtype=tf.float32, + ) + + self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4)) diff --git a/docs/transformers/tests/models/flaubert/test_tokenization_flaubert.py b/docs/transformers/tests/models/flaubert/test_tokenization_flaubert.py new file mode 100644 index 0000000000000000000000000000000000000000..30c65349883bc1983d226bc329535e632ed8acf3 --- /dev/null +++ b/docs/transformers/tests/models/flaubert/test_tokenization_flaubert.py @@ -0,0 +1,75 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the FlauBERT tokenizer.""" + +import json +import os +import unittest + +from transformers import FlaubertTokenizer +from transformers.models.flaubert.tokenization_flaubert import VOCAB_FILES_NAMES +from transformers.testing_utils import slow + +from ...test_tokenization_common import TokenizerTesterMixin + + +class FlaubertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "flaubert/flaubert_base_cased" + tokenizer_class = FlaubertTokenizer + test_rust_tokenizer = False + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w", "r", "t", "i", "lo", "low", "ne", "new", "er", "low", "lowest", "new", "newer", "wider", ""] # fmt: skip + + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["n e 300", "ne w 301", "e r 302", ""] + + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(cls.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + # Copied from transformers.tests.models.xlm.test_tokenization_xlm.XLMTokenizationTest.test_full_tokenizer + def test_full_tokenizer(self): + tokenizer = self.get_tokenizer() + text = "lower newer" + bpe_tokens = ["l", "o", "w", "er", "new", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [0, 1, 2, 18, 17, 18, 24] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + @slow + # Copied from transformers.tests.models.xlm.test_tokenization_xlm.XLMTokenizationTest.test_sequence_builders + def test_sequence_builders(self): + tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased") + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + print(encoded_sentence) + print(encoded_sentence) + + assert encoded_sentence == [0] + text + [1] + assert encoded_pair == [0] + text + [1] + text_2 + [1] diff --git a/docs/transformers/tests/models/flava/__init__.py b/docs/transformers/tests/models/flava/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/flava/test_image_processing_flava.py b/docs/transformers/tests/models/flava/test_image_processing_flava.py new file mode 100644 index 0000000000000000000000000000000000000000..5edb1997abbec7be832a221a90a98bc5fa3a57dc --- /dev/null +++ b/docs/transformers/tests/models/flava/test_image_processing_flava.py @@ -0,0 +1,432 @@ +# Copyright 2022 Meta Platforms authors and HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
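+ # The tester below exercises both the slow (PIL-based) FlavaImageProcessor and, when torchvision is
+ # available, FlavaImageProcessorFast, covering the masking-generator and codebook-pixel code paths.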
+ +import random +import unittest + +import numpy as np +import requests +from PIL import Image + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + import PIL + + from transformers import FlavaImageProcessor + + if is_torchvision_available(): + from transformers import FlavaImageProcessorFast + from transformers.image_utils import PILImageResampling + from transformers.models.flava.image_processing_flava import ( + FLAVA_CODEBOOK_MEAN, + FLAVA_CODEBOOK_STD, + FLAVA_IMAGE_MEAN, + FLAVA_IMAGE_STD, + ) +else: + FLAVA_IMAGE_MEAN = FLAVA_IMAGE_STD = FLAVA_CODEBOOK_MEAN = FLAVA_CODEBOOK_STD = None + + +class FlavaImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + resample=None, + do_rescale=True, + rescale_factor=1 / 255, + do_normalize=True, + image_mean=FLAVA_IMAGE_MEAN, + image_std=FLAVA_IMAGE_STD, + input_size_patches=14, + total_mask_patches=75, + mask_group_max_patches=None, + mask_group_min_patches=16, + mask_group_min_aspect_ratio=0.3, + mask_group_max_aspect_ratio=None, + codebook_do_resize=True, + codebook_size=None, + codebook_resample=None, + codebook_do_center_crop=True, + codebook_crop_size=None, + codebook_do_map_pixels=True, + codebook_do_normalize=True, + codebook_image_mean=FLAVA_CODEBOOK_MEAN, + codebook_image_std=FLAVA_CODEBOOK_STD, + ): + size = size if size is not None else {"height": 224, "width": 224} + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + codebook_size = codebook_size if codebook_size is not None else {"height": 112, "width": 112} + codebook_crop_size = codebook_crop_size if codebook_crop_size is not None else {"height": 112, "width": 112} + + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.do_resize = do_resize + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.size = size + self.resample = resample if resample is not None else PILImageResampling.BICUBIC + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_center_crop = do_center_crop + self.crop_size = crop_size + + self.input_size_patches = input_size_patches + self.total_mask_patches = total_mask_patches + self.mask_group_max_patches = mask_group_max_patches + self.mask_group_min_patches = mask_group_min_patches + self.mask_group_min_aspect_ratio = mask_group_min_aspect_ratio + self.mask_group_max_aspect_ratio = mask_group_max_aspect_ratio + + self.codebook_do_resize = codebook_do_resize + self.codebook_size = codebook_size + # LANCZOS resample does not support torch Tensor. 
Use BICUBIC as closest alternative + self.codebook_resample = codebook_resample if codebook_resample is not None else PILImageResampling.BICUBIC + self.codebook_do_center_crop = codebook_do_center_crop + self.codebook_crop_size = codebook_crop_size + self.codebook_do_map_pixels = codebook_do_map_pixels + self.codebook_do_normalize = codebook_do_normalize + self.codebook_image_mean = codebook_image_mean + self.codebook_image_std = codebook_image_std + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + "resample": self.resample, + "do_rescale": self.do_rescale, + "rescale_factor": self.rescale_factor, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "input_size_patches": self.input_size_patches, + "total_mask_patches": self.total_mask_patches, + "mask_group_max_patches": self.mask_group_max_patches, + "mask_group_min_patches": self.mask_group_min_patches, + "mask_group_min_aspect_ratio": self.mask_group_min_aspect_ratio, + "mask_group_max_aspect_ratio": self.mask_group_min_aspect_ratio, + "codebook_do_resize": self.codebook_do_resize, + "codebook_size": self.codebook_size, + "codebook_resample": self.codebook_resample, + "codebook_do_center_crop": self.codebook_do_center_crop, + "codebook_crop_size": self.codebook_crop_size, + "codebook_do_map_pixels": self.codebook_do_map_pixels, + "codebook_do_normalize": self.codebook_do_normalize, + "codebook_image_mean": self.codebook_image_mean, + "codebook_image_std": self.codebook_image_std, + } + + def get_expected_image_size(self): + return (self.size["height"], self.size["width"]) + + def get_expected_mask_size(self): + return ( + (self.input_size_patches, self.input_size_patches) + if not isinstance(self.input_size_patches, tuple) + else self.input_size_patches + ) + + def get_expected_codebook_image_size(self): + return (self.codebook_size["height"], self.codebook_size["width"]) + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class FlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = FlavaImageProcessor if is_vision_available() else None + fast_image_processing_class = FlavaImageProcessorFast if is_torchvision_available() else None + maxDiff = None + + def setUp(self): + super().setUp() + self.image_processor_tester = FlavaImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "resample")) + self.assertTrue(hasattr(image_processing, "crop_size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, 
"do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "masking_generator")) + self.assertTrue(hasattr(image_processing, "codebook_do_resize")) + self.assertTrue(hasattr(image_processing, "codebook_size")) + self.assertTrue(hasattr(image_processing, "codebook_resample")) + self.assertTrue(hasattr(image_processing, "codebook_do_center_crop")) + self.assertTrue(hasattr(image_processing, "codebook_crop_size")) + self.assertTrue(hasattr(image_processing, "codebook_do_map_pixels")) + self.assertTrue(hasattr(image_processing, "codebook_do_normalize")) + self.assertTrue(hasattr(image_processing, "codebook_image_mean")) + self.assertTrue(hasattr(image_processing, "codebook_image_std")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 224, "width": 224}) + self.assertEqual(image_processor.crop_size, {"height": 224, "width": 224}) + self.assertEqual(image_processor.codebook_size, {"height": 112, "width": 112}) + self.assertEqual(image_processor.codebook_crop_size, {"height": 112, "width": 112}) + + image_processor = self.image_processing_class.from_dict( + self.image_processor_dict, size=42, crop_size=84, codebook_size=33, codebook_crop_size=66 + ) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) + self.assertEqual(image_processor.codebook_size, {"height": 33, "width": 33}) + self.assertEqual(image_processor.codebook_crop_size, {"height": 66, "width": 66}) + + def test_call_pil(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, PIL.Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt") + + # Test no bool masked pos + self.assertFalse("bool_masked_pos" in encoded_images) + + expected_height, expected_width = self.image_processor_tester.get_expected_image_size() + + self.assertEqual( + encoded_images.pixel_values.shape, + (1, self.image_processor_tester.num_channels, expected_height, expected_width), + ) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt") + expected_height, expected_width = self.image_processor_tester.get_expected_image_size() + + # Test no bool masked pos + self.assertFalse("bool_masked_pos" in encoded_images) + + self.assertEqual( + encoded_images.pixel_values.shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + expected_height, + expected_width, + ), + ) + + def _test_call_framework(self, instance_class, prepare_kwargs): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, **prepare_kwargs) + for image in image_inputs: + self.assertIsInstance(image, instance_class) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt") + 
+ expected_height, expected_width = self.image_processor_tester.get_expected_image_size() + self.assertEqual( + encoded_images.pixel_values.shape, + (1, self.image_processor_tester.num_channels, expected_height, expected_width), + ) + + encoded_images = image_processing(image_inputs, return_image_mask=True, return_tensors="pt") + + expected_height, expected_width = self.image_processor_tester.get_expected_image_size() + self.assertEqual( + encoded_images.pixel_values.shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + expected_height, + expected_width, + ), + ) + + expected_height, expected_width = self.image_processor_tester.get_expected_mask_size() + self.assertEqual( + encoded_images.bool_masked_pos.shape, + ( + self.image_processor_tester.batch_size, + expected_height, + expected_width, + ), + ) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + + expected_height, expected_width = self.image_processor_tester.get_expected_image_size() + self.assertEqual( + encoded_images.shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + expected_height, + expected_width, + ), + ) + + # Test masking + encoded_images = image_processing(image_inputs, return_image_mask=True, return_tensors="pt") + + expected_height, expected_width = self.image_processor_tester.get_expected_image_size() + self.assertEqual( + encoded_images.pixel_values.shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + expected_height, + expected_width, + ), + ) + + expected_height, expected_width = self.image_processor_tester.get_expected_mask_size() + self.assertEqual( + encoded_images.bool_masked_pos.shape, + ( + self.image_processor_tester.batch_size, + expected_height, + expected_width, + ), + ) + + def test_call_numpy(self): + self._test_call_framework(np.ndarray, prepare_kwargs={"numpify": True}) + + def test_call_numpy_4_channels(self): + self.image_processing_class.num_channels = 4 + self._test_call_framework(np.ndarray, prepare_kwargs={"numpify": True}) + self.image_processing_class.num_channels = 3 + + def test_call_pytorch(self): + self._test_call_framework(torch.Tensor, prepare_kwargs={"torchify": True}) + + def test_masking(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + random.seed(1234) + image_processing = image_processing_class(**self.image_processor_dict) + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_image_mask=True, return_tensors="pt") + self.assertEqual(encoded_images.bool_masked_pos.sum().item(), 75) + + def test_codebook_pixels(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, PIL.Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_codebook_pixels=True, return_tensors="pt") + expected_height, expected_width = self.image_processor_tester.get_expected_codebook_image_size() + self.assertEqual( + encoded_images.codebook_pixel_values.shape, + (1, self.image_processor_tester.num_channels, 
expected_height, expected_width), + ) + + # Test batched + encoded_images = image_processing(image_inputs, return_codebook_pixels=True, return_tensors="pt") + expected_height, expected_width = self.image_processor_tester.get_expected_codebook_image_size() + self.assertEqual( + encoded_images.codebook_pixel_values.shape, + ( + self.image_processor_tester.batch_size, + self.image_processor_tester.num_channels, + expected_height, + expected_width, + ), + ) + + @require_vision + @require_torch + def test_slow_fast_equivalence(self): + if not self.test_slow_image_processor or not self.test_fast_image_processor: + self.skipTest(reason="Skipping slow/fast equivalence test") + + if self.image_processing_class is None or self.fast_image_processing_class is None: + self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined") + + dummy_image = Image.open( + requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw + ) + image_processor_slow = self.image_processing_class(**self.image_processor_dict) + image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict) + + encoding_slow = image_processor_slow( + dummy_image, return_tensors="pt", return_codebook_pixels=True, return_image_mask=True + ) + encoding_fast = image_processor_fast( + dummy_image, return_tensors="pt", return_codebook_pixels=True, return_image_mask=True + ) + self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1)) + self.assertLessEqual( + torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 1e-3 + ) + + self.assertTrue( + torch.allclose(encoding_slow.codebook_pixel_values, encoding_fast.codebook_pixel_values, atol=1e-1) + ) + self.assertLessEqual( + torch.mean(torch.abs(encoding_slow.codebook_pixel_values - encoding_fast.codebook_pixel_values)).item(), + 1e-3, + ) diff --git a/docs/transformers/tests/models/flava/test_modeling_flava.py b/docs/transformers/tests/models/flava/test_modeling_flava.py new file mode 100644 index 0000000000000000000000000000000000000000..acdac4dd87086e5f5cde3441c763b43accb05463 --- /dev/null +++ b/docs/transformers/tests/models/flava/test_modeling_flava.py @@ -0,0 +1,1368 @@ +# Copyright 2022 Meta Platforms authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
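+ # Covers the FLAVA sub-models (image, text, multimodal and image codebook) as well as the combined
+ # FlavaModel and FlavaForPreTraining wrappers, each driven by its own tester class below.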
+"""Testing suite for the PyTorch FLAVA model.""" + +import inspect +import os +import random +import tempfile +import unittest + +import numpy as np +import requests + +from transformers import ( + FlavaConfig, + FlavaImageCodebookConfig, + FlavaImageConfig, + FlavaMultimodalConfig, + FlavaTextConfig, +) +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, + random_attention_mask, +) +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + FlavaForPreTraining, + FlavaImageCodebook, + FlavaImageModel, + FlavaModel, + FlavaMultimodalModel, + FlavaTextModel, + ) +else: + FlavaModel = None + FlavaForPreTraining = None + torch = {} + + +if is_vision_available(): + from PIL import Image + + from transformers import FlavaProcessor + + +class FlavaImageModelTester: + def __init__( + self, + parent, + batch_size=12, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=30, + patch_size=2, + num_channels=3, + qkv_bias=True, + mask_token=True, + vocab_size=99, + ): + self.parent = parent + self.batch_size = batch_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.mask_token = mask_token + self.vocab_size = vocab_size + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + num_patches = self.image_size // self.patch_size + bool_masked_pos = ( + torch.rand((self.batch_size, num_patches, num_patches), device=pixel_values.device) < 0.9 + ).long() + config = self.get_config() + return config, pixel_values, bool_masked_pos + + def get_config(self): + return FlavaImageConfig( + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + initializer_range=self.initializer_range, + layer_norm_eps=self.layer_norm_eps, + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + qkv_bias=self.qkv_bias, + mask_token=self.mask_token, + vocab_size=self.vocab_size, + ) + + def create_and_check_model(self, config, pixel_values, bool_masked_pos): + model = FlavaImageModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values, bool_masked_pos) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, 
self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, bool_masked_pos = config_and_inputs + inputs_dict = {"pixel_values": pixel_values, "bool_masked_pos": bool_masked_pos} + return config, inputs_dict + + +@require_torch +class FlavaImageModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as FLAVA does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (FlavaImageModel,) if is_torch_available() else () + + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = FlavaImageModelTester(self) + self.config_tester = ConfigTester(self, config_class=FlavaImageConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip("Flava does not use input_ids") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + # in FLAVA, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_len = num_patches + 1 + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs 
= model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # FLAVA has a different seq_length + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_length = num_patches + 1 + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/flava-full" + model = FlavaImageModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class FlavaTextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + vocab_size=102, + type_vocab_size=2, + max_position_embeddings=512, + position_embedding_type="absolute", + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + 
layer_norm_eps=1e-12, + pad_token_id=0, + qkv_bias=True, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.seq_length = seq_length + self.vocab_size = vocab_size + self.type_vocab_size = type_vocab_size + self.max_position_embeddings = max_position_embeddings + self.position_embedding_type = position_embedding_type + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.qkv_bias = qkv_bias + self.pad_token_id = pad_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + token_type_ids = None + + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask + + def get_config(self): + return FlavaTextConfig( + vocab_size=self.vocab_size, + type_vocab_size=self.type_vocab_size, + max_position_embeddings=self.max_position_embeddings, + position_embedding_type=self.position_embedding_type, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + initializer_range=self.initializer_range, + layer_norm_eps=self.layer_norm_eps, + pad_token_id=self.pad_token_id, + qkv_bias=self.qkv_bias, + ) + + def create_and_check_model(self, config, input_ids, token_type_ids, input_mask): + model = FlavaTextModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, token_type_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class FlavaTextModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (FlavaTextModel,) if is_torch_available() else () + test_pruning = False + test_head_masking = False + test_torchscript = False + + def setUp(self): + self.model_tester = FlavaTextModelTester(self) + self.config_tester = ConfigTester(self, 
config_class=FlavaTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="FLAVA does not use input_embeds") + def test_inputs_embeds(self): + # FLAVA does not use inputs_embeds + pass + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/flava-full" + model = FlavaTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class FlavaMultimodalModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=44, + use_input_mask=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + qkv_bias=True, + ce_ignore_index=-100, + use_cls_token=True, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.use_input_mask = use_input_mask + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.qkv_bias = qkv_bias + self.ce_ignore_index = ce_ignore_index + self.use_cls_token = use_cls_token + + def prepare_config_and_inputs(self): + hidden_states = floats_tensor([self.batch_size, self.seq_length - 1, self.hidden_size]) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = self.get_config() + + return config, hidden_states, input_mask + + def get_config(self): + return FlavaMultimodalConfig( + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + initializer_range=self.initializer_range, + layer_norm_eps=self.layer_norm_eps, + qkv_bias=self.qkv_bias, + use_cls_token=self.use_cls_token, + ce_ignore_index=self.ce_ignore_index, + ) + + def create_and_check_model(self, config, hidden_states, input_mask): + model = FlavaMultimodalModel(config=config) + 
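+ # Note: hidden_states are prepared with seq_length - 1 positions; with use_cls_token=True the
+ # multimodal encoder is expected to prepend a CLS embedding, which is why the shape assertions
+ # below check for the full self.seq_length again.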
model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(hidden_states, attention_mask=input_mask) + result = model(hidden_states) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, hidden_states, input_mask = config_and_inputs + inputs_dict = {"hidden_states": hidden_states, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class FlavaMultimodalModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (FlavaMultimodalModel,) if is_torch_available() else () + test_pruning = False + test_head_masking = False + test_resize_embeddings = False + test_torchscript = False + + def setUp(self): + self.model_tester = FlavaMultimodalModelTester(self) + self.config_tester = ConfigTester( + self, config_class=FlavaMultimodalConfig, has_text_modality=False, hidden_size=37 + ) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["hidden_states"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + @unittest.skip("FLAVA does not have input embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="FLAVA does not use input_embeds") + def test_inputs_embeds(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/flava-full" + model = FlavaMultimodalModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class FlavaImageCodebookTester: + def __init__( + self, + parent, + batch_size=12, + image_size=112, + num_channels=3, + hidden_size=32, + num_groups=2, + vocab_size=99, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.hidden_size = hidden_size + self.num_groups = num_groups + self.vocab_size = vocab_size + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return FlavaImageCodebookConfig( + hidden_size=self.hidden_size, num_groups=self.num_groups, vocab_size=self.vocab_size + ) + + def 
create_and_check_model(self, config, pixel_values): + model = FlavaImageCodebook(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + self.parent.assertEqual( + result.shape, (self.batch_size, config.vocab_size, self.image_size // 8, self.image_size // 8) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class FlavaImageCodebookTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (FlavaImageCodebook,) if is_torch_available() else () + test_pruning = False + test_head_masking = False + test_resize_embeddings = False + test_torchscript = False + has_attentions = False + + def setUp(self): + self.model_tester = FlavaImageCodebookTester(self) + self.config_tester = ConfigTester(self, config_class=FlavaImageCodebookConfig, has_text_modality=False) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + @unittest.skip(reason="Flava does not output attentions") + def test_attention_outputs(self): + pass + + @unittest.skip(reason="No embedding in multimodal model") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="FlavaImageCodebook has no attentions") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="FLAVA does not use input_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip + def test_model_outputs_equivalence(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/flava-full" + model = FlavaImageCodebook.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class FlavaModelTester: + model_class = FlavaModel + + def __init__( + self, + parent, + text_kwargs=None, + image_kwargs=None, + multimodal_kwargs=None, + image_codebook_kwargs=None, + is_training=True, + hidden_size=32, + projection_dim=32, + initializer_range=0.02, + layer_norm_eps=1e-12, + ): + if text_kwargs is None: + text_kwargs = {} + if image_kwargs is None: + image_kwargs = {} + if multimodal_kwargs is None: + multimodal_kwargs = {} + if image_codebook_kwargs is None: + image_codebook_kwargs = {} + + self.parent = parent + 
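+ # Compose the per-modality testers so that get_config() below can assemble a FlavaConfig from
+ # their individual sub-configs via FlavaConfig.from_configs.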
self.image_model_tester = FlavaImageModelTester(parent, **image_kwargs) + self.text_model_tester = FlavaTextModelTester(parent, **text_kwargs) + self.multimodal_model_tester = FlavaMultimodalModelTester(parent, **multimodal_kwargs) + self.image_codebook_tester = FlavaImageCodebookTester(parent, **image_codebook_kwargs) + self.is_training = is_training + self.config_tester = ConfigTester(self, config_class=FlavaConfig, hidden_size=37) + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test + + def test_config(self): + self.config_tester.run_common_tests() + + def prepare_config_and_inputs_for_common(self): + _, pixel_values, bool_masked_pos = self.image_model_tester.prepare_config_and_inputs() + _, input_ids, token_type_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "bool_masked_pos": bool_masked_pos, + } + + def get_config(self): + return FlavaConfig.from_configs( + self.image_model_tester.get_config(), + self.text_model_tester.get_config(), + self.multimodal_model_tester.get_config(), + self.image_codebook_tester.get_config(), + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + initializer_range=self.initializer_range, + layer_norm_eps=self.layer_norm_eps, + ) + + def create_and_check_model(self, config, inputs): + self._test_model(config, inputs, test_image=True) + self._test_model(config, inputs, test_text=True) + self._test_model(config, inputs, test_image=True, test_text=True) + + def _test_model(self, config, inputs, test_image=False, test_text=False): + model = self.model_class(config).to(torch_device).eval() + with torch.no_grad(): + result = model( + input_ids=inputs["input_ids"] if test_text else None, + attention_mask=inputs["attention_mask"] if test_text else None, + token_type_ids=inputs["token_type_ids"] if test_text else None, + pixel_values=inputs["pixel_values"] if test_image else None, + bool_masked_pos=inputs["bool_masked_pos"] if test_image else None, + ) + image_size = (self.image_model_tester.image_size, self.image_model_tester.image_size) + patch_size = (self.image_model_tester.patch_size, self.image_model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + + if test_image: + self.parent.assertEqual( + result.image_embeddings.shape, + (self.image_model_tester.batch_size, num_patches + 1, self.image_model_tester.hidden_size), + ) + else: + self.parent.assertIsNone(result.image_embeddings) + + if test_text: + self.parent.assertEqual( + result.text_embeddings.shape, + ( + self.text_model_tester.batch_size, + self.text_model_tester.seq_length, + self.text_model_tester.hidden_size, + ), + ) + else: + self.parent.assertIsNone(result.text_embeddings) + + if test_image and test_text: + self.parent.assertEqual( + result.multimodal_embeddings.shape, + ( + self.multimodal_model_tester.batch_size, + self.text_model_tester.seq_length + num_patches + 2, + self.multimodal_model_tester.hidden_size, + ), + ) + else: + self.parent.assertIsNone(result.multimodal_embeddings) + + +@require_torch +class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (FlavaModel,) if 
is_torch_available() else () + pipeline_model_mapping = {"feature-extraction": FlavaModel} if is_torch_available() else {} + class_for_tester = FlavaModelTester + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + + def setUp(self): + self.model_tester = self.class_for_tester(self) + common_properties = ["projection_dim", "logit_scale_init_value", "init_codebook"] + self.config_tester = ConfigTester( + self, config_class=FlavaConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="tested in individual model tests") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="FlavaModel does not have input/output embeddings") + def test_model_get_set_embeddings(self): + pass + + # override as the `logit_scale` parameter initilization is different for FLAVA + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + # check if `logit_scale` is initilized as per the original implementation + if name == "logit_scale" or name == "flava.logit_scale": + self.assertAlmostEqual( + param.data.item(), + np.log(1 / 0.07), + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + self.skipTest(reason="test_torchscript is set to False") + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + configs_no_init.return_dict = False + configs_no_init.return_loss = False + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + try: + input_ids = inputs_dict["input_ids"] + pixel_values = inputs_dict["pixel_values"] # FLAVA needs pixel_values + + if "input_ids_masked" in inputs_dict: + # For pretraining + inputs = (input_ids, inputs_dict["input_ids_masked"], pixel_values) + else: + inputs = (input_ids, pixel_values) + + traced_model = torch.jit.trace(model, inputs) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = 
loaded_model.state_dict() + # Non persistent buffers won't be in original state dict + loaded_model_state_dict.pop("text_model.embeddings.token_type_ids", None) + + non_persistent_buffers = {} + for key in loaded_model_state_dict.keys(): + if key not in model_state_dict.keys(): + non_persistent_buffers[key] = loaded_model_state_dict[key] + + loaded_model_state_dict = { + key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers + } + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + model_buffers = list(model.buffers()) + for non_persistent_buffer in non_persistent_buffers.values(): + found_buffer = False + for i, model_buffer in enumerate(model_buffers): + if torch.equal(non_persistent_buffer, model_buffer): + found_buffer = True + break + + self.assertTrue(found_buffer) + model_buffers.pop(i) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_load_image_text_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Save FlavaConfig and check if we can load FlavaImageConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + image_config = FlavaImageConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.image_config.to_dict(), image_config.to_dict()) + + # Save FlavaConfig and check if we can load FlavaTextConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + text_config = FlavaTextConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) + + # Save FlavaConfig and check if we can load FlavaMultimodalConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + multimodal_config = FlavaMultimodalConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.multimodal_config.to_dict(), multimodal_config.to_dict()) + + # overwrite from common since FlavaModel/TFFlavaModel return FLAVAOutput/TFFLAVAOutput + @slow + def test_model_from_pretrained(self): + model_name = "facebook/flava-full" + model = FlavaModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class FlavaForPreTrainingTester(FlavaModelTester): + model_class = FlavaForPreTraining + + def prepare_config_and_inputs_for_common(self): + _, pixel_values, bool_masked_pos = self.image_model_tester.prepare_config_and_inputs() + _, input_ids, token_type_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + config = self.get_config() + + input_ids_masked = input_ids.detach().clone() + input_ids_masked[:, 1:3] = 100 + mlm_labels = input_ids.detach().clone() + mlm_labels[:, :] = config.ce_ignore_index + mlm_labels[:, 1:3] = input_ids[:, 1:3] + mim_labels = torch.randint( + 0, self.image_model_tester.vocab_size, bool_masked_pos.size(), device=bool_masked_pos.device + ).long() + mim_labels[bool_masked_pos.ne(True)] = config.ce_ignore_index + itm_labels = torch.ones(mlm_labels.size(0), device=bool_masked_pos.device).long() + + return config, { + "input_ids": input_ids, + "input_ids_masked": input_ids_masked, + "token_type_ids": token_type_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "bool_masked_pos": bool_masked_pos, + "mlm_labels": mlm_labels, + "mim_labels": 
mim_labels, + "itm_labels": itm_labels, + "return_loss": True, + } + + def _test_model(self, config, inputs, test_image=False, test_text=False): + model = self.model_class(config).to(torch_device).eval() + with torch.no_grad(): + result = model( + input_ids=inputs["input_ids"] if test_text else None, + input_ids_masked=inputs["input_ids_masked"] if test_text else None, + attention_mask=inputs["attention_mask"] if test_text else None, + token_type_ids=inputs["token_type_ids"] if test_text else None, + pixel_values=inputs["pixel_values"] if test_image else None, + bool_masked_pos=inputs["bool_masked_pos"] if test_image else None, + mlm_labels=inputs["mlm_labels"], + mim_labels=inputs["mim_labels"], + itm_labels=inputs["itm_labels"], + return_loss=inputs["return_loss"], + ) + image_size = (self.image_model_tester.image_size, self.image_model_tester.image_size) + patch_size = (self.image_model_tester.patch_size, self.image_model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + + if test_image: + self.parent.assertEqual( + result.image_embeddings.shape, + (self.image_model_tester.batch_size, num_patches + 1, self.image_model_tester.hidden_size), + ) + if not test_text: + self.parent.assertEqual( + result.loss_info.mim.dim(), + 0, + ) + self.parent.assertEqual( + result.mim_logits.shape, + (inputs["bool_masked_pos"].sum().item(), self.image_model_tester.vocab_size), + ) + + else: + self.parent.assertIsNone(result.image_embeddings) + + if test_text: + self.parent.assertEqual( + result.text_embeddings.shape, + ( + self.text_model_tester.batch_size, + self.text_model_tester.seq_length, + self.text_model_tester.hidden_size, + ), + ) + if not test_image: + self.parent.assertEqual(result.loss_info.mlm.dim(), 0) + self.parent.assertEqual( + result.mlm_logits.shape, + ( + (inputs["mlm_labels"] != self.multimodal_model_tester.ce_ignore_index).sum().item(), + self.text_model_tester.vocab_size, + ), + ) + else: + self.parent.assertIsNone(result.text_embeddings) + + if test_image and test_text: + self.parent.assertEqual( + result.multimodal_masked_embeddings.shape, + ( + self.multimodal_model_tester.batch_size, + self.text_model_tester.seq_length + num_patches + 2, + self.multimodal_model_tester.hidden_size, + ), + ) + self.parent.assertEqual( + result.itm_logits.shape, + (self.text_model_tester.batch_size, 2), + ) + self.parent.assertEqual( + result.mmm_text_logits.shape, + ( + (inputs["mlm_labels"] != self.multimodal_model_tester.ce_ignore_index).sum().item(), + self.text_model_tester.vocab_size, + ), + ) + self.parent.assertEqual( + result.mmm_image_logits.shape, + (inputs["bool_masked_pos"].sum().item(), self.image_model_tester.vocab_size), + ) + self.parent.assertEqual( + result.contrastive_logits_per_image.shape, + (self.image_model_tester.batch_size, self.text_model_tester.batch_size), + ) + self.parent.assertEqual( + result.contrastive_logits_per_text.shape, + (self.text_model_tester.batch_size, self.image_model_tester.batch_size), + ) + + for item in [ + result.loss_info.global_contrastive, + result.loss_info.itm, + result.loss_info.mmm_text, + result.loss_info.mmm_image, + ]: + self.parent.assertEqual(item.dim(), 0) + + for item in [result.loss_info.mim, result.loss_info.mlm]: + self.parent.assertIsNone(item) + + else: + self.parent.assertIsNone(result.multimodal_masked_embeddings) + for item in [ + result.loss_info.global_contrastive, + result.loss_info.itm, + result.loss_info.mmm_text, + result.loss_info.mmm_image, + ]: + 
self.parent.assertIsNone(item) + + self.parent.assertIsNone(result.multimodal_embeddings) + + +@require_torch +class FlavaForPreTrainingTest(FlavaModelTest): + all_model_classes = (FlavaForPreTraining,) if is_torch_available() else () + class_for_tester = FlavaForPreTrainingTester + test_torchscript = False + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@require_vision +@require_torch +class FlavaModelIntegrationTest(unittest.TestCase): + @slow + def test_inference(self): + model_name = "facebook/flava-full" + model = FlavaModel.from_pretrained(model_name).to(torch_device) + processor = FlavaProcessor.from_pretrained(model_name) + + image = prepare_img() + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], + images=[image, image], + padding="max_length", + max_length=77, + return_tensors="pt", + ).to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, return_dict=True) + + # verify the embeddings + self.assertAlmostEqual(outputs.image_embeddings.sum().item(), -1352.53540, places=4) + self.assertAlmostEqual(outputs.text_embeddings.sum().item(), -198.98225, places=4) + self.assertAlmostEqual(outputs.multimodal_embeddings.sum().item(), -4030.4604492, places=4) + + +@require_vision +@require_torch +class FlavaForPreTrainingIntegrationTest(unittest.TestCase): + @slow + def test_inference(self): + model_name = "facebook/flava-full" + model = FlavaForPreTraining.from_pretrained(model_name).to(torch_device) + processor = FlavaProcessor.from_pretrained(model_name) + torch.manual_seed(1) + random.seed(1) + + image = prepare_img() + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], + images=[image, image], + padding="max_length", + max_length=77, + return_tensors="pt", + return_codebook_pixels=True, + return_image_mask=True, + ) + # Create a clone of the input_ids tensor that will be its masked version + inputs["input_ids_masked"] = inputs["input_ids"].clone() + # Mask the tokens "a" & "cat" from the "a photo of a cat" text using the special 103 value + inputs["input_ids_masked"][0, 4:6] = 103 + # MLM labels. 
It is a cloned version of input_ids where all values are -100 (i.e., ignored) + # except those that are masked, whose original values are stored + inputs["mlm_labels"] = inputs["input_ids"].clone() + inputs["mlm_labels"][:, :] = -100 + inputs["mlm_labels"][0, 4:6] = inputs["input_ids"][0, 4:6] + inputs = inputs.to(torch_device) + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + self.assertEqual( + outputs.contrastive_logits_per_image.shape, + torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + ) + self.assertEqual( + outputs.contrastive_logits_per_text.shape, + torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), + ) + + expected_logits = torch.tensor([[16.1291, 8.4033], [16.1291, 8.4033]], device=torch_device) + torch.testing.assert_close(outputs.contrastive_logits_per_image, expected_logits, rtol=1e-3, atol=1e-3) + self.assertAlmostEqual(outputs.loss_info.mmm_text.item(), 2.0727925, places=4) + self.assertAlmostEqual(outputs.loss_info.mmm_image.item(), 7.0282096, places=4) + self.assertAlmostEqual(outputs.loss.item(), 11.3792324, places=4) + + @slow + def test_inference_with_itm_labels(self): + model_name = "facebook/flava-full" + model = FlavaForPreTraining.from_pretrained(model_name).to(torch_device) + processor = FlavaProcessor.from_pretrained(model_name) + torch.manual_seed(1) + random.seed(1) + + image = prepare_img() + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], + images=[image, image], + padding="max_length", + max_length=77, + return_tensors="pt", + return_codebook_pixels=True, + return_image_mask=True, + ) + # Create a clone of the input_ids tensor that will be its masked version + inputs["input_ids_masked"] = inputs["input_ids"].clone() + # Mask the tokens "a" & "cat" from the "a photo of a cat" text using the special 103 value + inputs["input_ids_masked"][0, 4:6] = 103 + # MLM labels. It is a cloned version of input_ids where all values are -100 (i.e., ignored) + # except those that are masked, whose original values are stored + inputs["mlm_labels"] = inputs["input_ids"].clone() + inputs["mlm_labels"][:, :] = -100 + inputs["mlm_labels"][0, 4:6] = inputs["input_ids"][0, 4:6] + # Manually create the itm_labels tensor that indicates if the image-text match. 
+ # In this case, the firs pair matches and the second does not + inputs["itm_labels"] = torch.tensor([1, 0]) + inputs = inputs.to(torch_device) + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + self.assertEqual( + outputs.contrastive_logits_per_image.shape, + torch.Size((torch.count_nonzero(inputs["itm_labels"]).item(), inputs.input_ids.shape[0])), + ) + self.assertEqual( + outputs.contrastive_logits_per_text.shape, + torch.Size((torch.count_nonzero(inputs["itm_labels"]).item(), inputs.pixel_values.shape[0])), + ) + + expected_logits = torch.tensor([[16.1291, 8.4033], [16.1291, 8.4033]], device=torch_device) + torch.testing.assert_close(outputs.contrastive_logits_per_image, expected_logits, rtol=1e-3, atol=1e-3) + self.assertAlmostEqual(outputs.loss_info.mmm_text.item(), 2.0727925, places=4) + self.assertAlmostEqual(outputs.loss_info.mmm_image.item(), 6.8965902, places=4) + self.assertAlmostEqual(outputs.loss.item(), 9.6084213, places=4) diff --git a/docs/transformers/tests/models/flava/test_processor_flava.py b/docs/transformers/tests/models/flava/test_processor_flava.py new file mode 100644 index 0000000000000000000000000000000000000000..8489322efd6901d543fd20aa59ecec6b0ca9bfac --- /dev/null +++ b/docs/transformers/tests/models/flava/test_processor_flava.py @@ -0,0 +1,234 @@ +# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
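+# Note: the tests below exercise FlavaProcessor end to end: save/load round-trips with both the
+# slow (BertTokenizer) and fast (BertTokenizerFast) tokenizers, parity between FlavaImageProcessor
+# and the processor for image inputs (including the codebook and masking options), tokenizer parity
+# for text inputs, the keys returned for combined text+image inputs, batch_decode passthrough, and
+# model_input_names.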
+ +import json +import os +import random +import shutil +import tempfile +import unittest + +import pytest + +from transformers import BertTokenizer, BertTokenizerFast +from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES +from transformers.testing_utils import require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import FlavaImageProcessor, FlavaProcessor + from transformers.models.flava.image_processing_flava import ( + FLAVA_CODEBOOK_MEAN, + FLAVA_CODEBOOK_STD, + FLAVA_IMAGE_MEAN, + FLAVA_IMAGE_STD, + ) + + +@require_vision +class FlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = FlavaProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "want", "##want", "##ed", "wa", "un", "runn", "##ing", ",", "low", "lowest"] # fmt: skip + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write("".join([x + "\n" for x in vocab_tokens])) + + image_processor_map = { + "image_mean": FLAVA_IMAGE_MEAN, + "image_std": FLAVA_IMAGE_STD, + "do_normalize": True, + "do_resize": True, + "size": 224, + "do_center_crop": True, + "crop_size": 224, + "input_size_patches": 14, + "total_mask_patches": 75, + "mask_group_max_patches": None, + "mask_group_min_patches": 16, + "mask_group_min_aspect_ratio": 0.3, + "mask_group_max_aspect_ratio": None, + "codebook_do_resize": True, + "codebook_size": 112, + "codebook_do_center_crop": True, + "codebook_crop_size": 112, + "codebook_do_map_pixels": True, + "codebook_do_normalize": True, + "codebook_image_mean": FLAVA_CODEBOOK_MEAN, + "codebook_image_std": FLAVA_CODEBOOK_STD, + } + + self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME) + with open(self.image_processor_file, "w", encoding="utf-8") as fp: + json.dump(image_processor_map, fp) + + def get_tokenizer(self, **kwargs): + return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_rust_tokenizer(self, **kwargs): + return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + + def get_image_processor(self, **kwargs): + return FlavaImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def test_save_load_pretrained_default(self): + tokenizer_slow = self.get_tokenizer() + tokenizer_fast = self.get_rust_tokenizer() + image_processor = self.get_image_processor() + + processor_slow = FlavaProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) + processor_slow.save_pretrained(self.tmpdirname) + processor_slow = FlavaProcessor.from_pretrained(self.tmpdirname, use_fast=False) + + processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast.save_pretrained(self.tmpdirname) + processor_fast = FlavaProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) + self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) + self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) + self.assertIsInstance(processor_slow.tokenizer, BertTokenizer) + self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) + + self.assertEqual(processor_slow.image_processor.to_json_string(), 
image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertIsInstance(processor_slow.image_processor, FlavaImageProcessor) + self.assertIsInstance(processor_fast.image_processor, FlavaImageProcessor) + + def test_save_load_pretrained_additional_features(self): + processor = FlavaProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + + processor = FlavaProcessor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, BertTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, FlavaImageProcessor) + + def test_image_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) + + image_input = self.prepare_image_inputs() + + input_feat_extract = image_processor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + # With rest of the args + random.seed(1234) + input_feat_extract = image_processor( + image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np" + ) + random.seed(1234) + input_processor = processor( + images=image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np" + ) + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"]) + + # add extra args + inputs = processor(text=input_str, images=image_input, return_codebook_pixels=True, return_image_mask=True) + + self.assertListEqual( + list(inputs.keys()), + [ + "input_ids", + "token_type_ids", + "attention_mask", + "pixel_values", + "codebook_pixel_values", + "bool_masked_pos", + ], + ) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + def test_tokenizer_decode(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + 
processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + def test_model_input_names(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), processor.model_input_names) diff --git a/docs/transformers/tests/models/fnet/__init__.py b/docs/transformers/tests/models/fnet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/fnet/test_modeling_fnet.py b/docs/transformers/tests/models/fnet/test_modeling_fnet.py new file mode 100644 index 0000000000000000000000000000000000000000..eca30656dd20bd396cfd621aa125d1d99a7f9d5e --- /dev/null +++ b/docs/transformers/tests/models/fnet/test_modeling_fnet.py @@ -0,0 +1,574 @@ +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
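+# Note: FNet mixes tokens with Fourier transforms instead of self-attention, so the attention-related
+# common tests (attention outputs, head masking, pruning) are skipped or disabled in the test class
+# below, and the equivalence and retain-grad tests are overridden to only inspect hidden states.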
+"""Testing suite for the PyTorch FNet model.""" + +import unittest + +from transformers import FNetConfig, is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_tokenizers, require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + FNetForMaskedLM, + FNetForMultipleChoice, + FNetForNextSentencePrediction, + FNetForPreTraining, + FNetForQuestionAnswering, + FNetForSequenceClassification, + FNetForTokenClassification, + FNetModel, + FNetTokenizerFast, + ) + + +# Override ConfigTester +class FNetConfigTester(ConfigTester): + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + if self.has_text_modality: + self.parent.assertTrue(hasattr(config, "vocab_size")) + self.parent.assertTrue(hasattr(config, "hidden_size")) + self.parent.assertTrue(hasattr(config, "num_hidden_layers")) + + +class FNetModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, sequence_labels, token_labels, choice_labels + + def get_config(self): + return FNetConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + tpu_short_seq_length=self.seq_length, + ) + + def create_and_check_model(self, config, input_ids, 
token_type_ids, sequence_labels, token_labels, choice_labels): + model = FNetModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_pretraining( + self, config, input_ids, token_type_ids, sequence_labels, token_labels, choice_labels + ): + model = FNetForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + token_type_ids=token_type_ids, + labels=token_labels, + next_sentence_label=sequence_labels, + ) + self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, sequence_labels, token_labels, choice_labels + ): + model = FNetForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, sequence_labels, token_labels, choice_labels + ): + model = FNetForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = FNetForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = FNetForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = FNetForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + 
sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids} + return config, inputs_dict + + +@require_torch +class FNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + FNetModel, + FNetForPreTraining, + FNetForMaskedLM, + FNetForNextSentencePrediction, + FNetForMultipleChoice, + FNetForQuestionAnswering, + FNetForSequenceClassification, + FNetForTokenClassification, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": FNetModel, + "fill-mask": FNetForMaskedLM, + "question-answering": FNetForQuestionAnswering, + "text-classification": FNetForSequenceClassification, + "token-classification": FNetForTokenClassification, + "zero-shot": FNetForSequenceClassification, + } + if is_torch_available() + else {} + ) + + # Skip Tests + test_pruning = False + test_head_masking = False + + # TODO: Fix the failed tests + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + if pipeline_test_case_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): + return True + + return False + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["next_sentence_label"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + # Overridden Tests + @unittest.skip + def test_attention_outputs(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def set_nan_tensor_to_zero(t): + t[t != t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (list, tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), 
dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + torch.allclose( + set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 + ), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:" + f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has" + f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}." + ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs) + + # tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + # dict_inputs = self._prepare_for_class(inputs_dict, model_class) + # check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + hidden_states = outputs.hidden_states[0] + + hidden_states.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + + def setUp(self): + self.model_tester = FNetModelTester(self) + self.config_tester = FNetConfigTester(self, config_class=FNetConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_pretraining(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "google/fnet-base" + model = FNetModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class FNetModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_for_masked_lm(self): + """ + For comparison: + 1. Modify the pre-training model `__call__` to skip computing metrics and return masked_lm_output like so: + ``` + ... + sequence_output, pooled_output = EncoderModel( + self.config, random_seed=self.random_seed, name="encoder")( + input_ids, input_mask, type_ids, deterministic=deterministic) + + masked_lm_output = nn.Dense( + self.config.d_emb, + kernel_init=default_kernel_init, + name="predictions_dense")( + sequence_output) + masked_lm_output = nn.gelu(masked_lm_output) + masked_lm_output = nn.LayerNorm( + epsilon=LAYER_NORM_EPSILON, name="predictions_layer_norm")( + masked_lm_output) + masked_lm_logits = layers.OutputProjection( + kernel=self._get_embedding_table(), name="predictions_output")( + masked_lm_output) + + next_sentence_logits = layers.OutputProjection( + n_out=2, kernel_init=default_kernel_init, name="classification")( + pooled_output) + + return masked_lm_logits + ... + ``` + 2. Run the following: + >>> import jax.numpy as jnp + >>> import sentencepiece as spm + >>> from flax.training import checkpoints + >>> from f_net.models import PreTrainingModel + >>> from f_net.configs.pretraining import get_config, ModelArchitecture + + >>> pretrained_params = checkpoints.restore_checkpoint('./f_net/f_net_checkpoint', None) # Location of original checkpoint + >>> pretrained_config = get_config() + >>> pretrained_config.model_arch = ModelArchitecture.F_NET + + >>> vocab_filepath = "./f_net/c4_bpe_sentencepiece.model" # Location of the sentence piece model + >>> tokenizer = spm.SentencePieceProcessor() + >>> tokenizer.Load(vocab_filepath) + >>> with pretrained_config.unlocked(): + >>> pretrained_config.vocab_size = tokenizer.GetPieceSize() + >>> tokens = jnp.array([[0, 1, 2, 3, 4, 5]]) + >>> type_ids = jnp.zeros_like(tokens, dtype="i4") + >>> attention_mask = jnp.ones_like(tokens) # Dummy. This gets deleted inside the model. 
+ + >>> flax_pretraining_model = PreTrainingModel(pretrained_config) + >>> pretrained_model_params = freeze(pretrained_params['target']) + >>> flax_model_outputs = flax_pretraining_model.apply({"params": pretrained_model_params}, tokens, attention_mask, type_ids, None, None, None, None, deterministic=True) + >>> masked_lm_logits[:, :3, :3] + """ + + model = FNetForMaskedLM.from_pretrained("google/fnet-base") + model.to(torch_device) + + input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]], device=torch_device) + with torch.no_grad(): + output = model(input_ids)[0] + + vocab_size = 32000 + + expected_shape = torch.Size((1, 6, vocab_size)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor( + [[[-1.7819, -7.7384, -7.5002], [-3.4746, -8.5943, -7.7762], [-3.2052, -9.0771, -8.3468]]], + device=torch_device, + ) + + torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + @slow + @require_tokenizers + def test_inference_long_sentence(self): + tokenizer = FNetTokenizerFast.from_pretrained("google/fnet-base") + + inputs = tokenizer( + "the man worked as a [MASK].", + "this is his [MASK].", + return_tensors="pt", + padding="max_length", + max_length=512, + ) + + torch.testing.assert_close(inputs["input_ids"], torch.tensor([[4, 13, 283, 2479, 106, 8, 6, 845, 5, 168, 65, 367, 6, 845, 5, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3]])) # fmt: skip + + inputs = {k: v.to(torch_device) for k, v in inputs.items()} + + model = FNetForMaskedLM.from_pretrained("google/fnet-base") + model.to(torch_device) + logits = model(**inputs).logits + predictions_mask_1 = tokenizer.decode(logits[0, 6].topk(5).indices) + predictions_mask_2 = tokenizer.decode(logits[0, 12].topk(5).indices) + + self.assertEqual(predictions_mask_1.split(" "), ["man", "child", "teacher", "woman", "model"]) + self.assertEqual(predictions_mask_2.split(" "), ["work", "wife", "job", "story", "name"]) + + @slow + def test_inference_for_next_sentence_prediction(self): + model = FNetForNextSentencePrediction.from_pretrained("google/fnet-base") + model.to(torch_device) + + input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]], 
device=torch_device) + with torch.no_grad(): + output = model(input_ids)[0] + + expected_shape = torch.Size((1, 2)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor([[-0.2234, -0.0226]], device=torch_device) + + torch.testing.assert_close(output, expected_slice, rtol=1e-4, atol=1e-4) + + @slow + def test_inference_model(self): + model = FNetModel.from_pretrained("google/fnet-base") + model.to(torch_device) + + input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]], device=torch_device) + with torch.no_grad(): + output = model(input_ids)[0] + + expected_shape = torch.Size((1, 6, model.config.hidden_size)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor( + [[[4.1541, -0.1051, -0.1667], [-0.9144, 0.2939, -0.0086], [-0.8472, -0.7281, 0.0256]]], device=torch_device + ) + + torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/fnet/test_tokenization_fnet.py b/docs/transformers/tests/models/fnet/test_tokenization_fnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b70aa33e0a1a6a13501085db91246eb33bbbd551 --- /dev/null +++ b/docs/transformers/tests/models/fnet/test_tokenization_fnet.py @@ -0,0 +1,451 @@ +# Copyright 2019 Hugging Face inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
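+# Note: these tokenization tests build an FNetTokenizer from the local SentencePiece fixture
+# (fixtures/spiece.model) and check that the slow and Rust-backed fast tokenizers stay in sync,
+# including padding behaviour and special-token handling.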
+ +import unittest + +from transformers import FNetTokenizer, FNetTokenizerFast +from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow, tooslow +from transformers.tokenization_utils import AddedToken + +from ...test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model") + + +@require_sentencepiece +@require_tokenizers +class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/fnet-base" + tokenizer_class = FNetTokenizer + rust_tokenizer_class = FNetTokenizerFast + test_rust_tokenizer = True + test_sentencepiece = True + test_sentencepiece_ignore_case = True + test_seq2seq = False + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # We have a SentencePiece fixture for testing + tokenizer = FNetTokenizer(SAMPLE_VOCAB) + tokenizer.save_pretrained(cls.tmpdirname) + + def get_input_output_texts(self, tokenizer): + input_text = "this is a test" + output_text = "this is a test" + return input_text, output_text + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "" + token_id = 0 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "") + self.assertEqual(vocab_keys[1], "") + self.assertEqual(vocab_keys[-1], "▁eloquent") + self.assertEqual(len(vocab_keys), 30_000) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 30_000) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "I was born in 92000, and this is falsé." 
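+ # The slow (SentencePiece) and fast (Rust) tokenizers should produce identical tokens and ids
+ # for this accented sample, both with and without special tokens.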
+ + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + def test_full_tokenizer(self): + tokenizer = FNetTokenizer(SAMPLE_VOCAB, keep_accents=True) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁", "T", "his", "▁is", "▁a", "▁test"]) + + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [13, 1, 4398, 25, 21, 1289]) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + ["▁", "I", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."], + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual(ids, [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9]) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + [ + "▁", + "", + "▁was", + "▁born", + "▁in", + "▁9", + "2000", + ",", + "▁and", + "▁this", + "▁is", + "▁fal", + "s", + "", + ".", + ], + ) + + def test_sequence_builders(self): + tokenizer = FNetTokenizer(SAMPLE_VOCAB) + + text = tokenizer.encode("sequence builders") + text_2 = tokenizer.encode("multi-sequence build") + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ + tokenizer.sep_token_id + ] + + # Overridden Tests - loading the fast tokenizer from slow just takes too long + def test_special_tokens_initialization(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + added_tokens = [AddedToken("", lstrip=True)] + + tokenizer_r = self.get_rust_tokenizer( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + r_output = tokenizer_r.encode("Hey this is a token") + + special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0] + + self.assertTrue(special_token_id in r_output) + + if self.test_slow_tokenizer: + tokenizer_p = self.tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + + p_output = tokenizer_p.encode("Hey this is a token") + + cr_output = tokenizer_r.encode("Hey this is a token") + + self.assertEqual(p_output, r_output) + self.assertEqual(cr_output, r_output) + self.assertTrue(special_token_id in p_output) + self.assertTrue(special_token_id in cr_output) + + @tooslow + def test_special_tokens_initialization_from_slow(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + added_tokens = [AddedToken("", lstrip=True)] + tokenizer_r = self.get_rust_tokenizer( + pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True + ) + special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0] + tokenizer_p = self.tokenizer_class.from_pretrained( + pretrained_name, 
additional_special_tokens=added_tokens, **kwargs + ) + + p_output = tokenizer_p.encode("Hey this is a token") + cr_output = tokenizer_r.encode("Hey this is a token") + + self.assertEqual(p_output, cr_output) + self.assertTrue(special_token_id in p_output) + self.assertTrue(special_token_id in cr_output) + + # Overridden Tests + def test_padding(self, max_length=50): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + self.skipTest(reason="test_slow_tokenizer is set to False") + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + + self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) + pad_token_id = tokenizer_p.pad_token_id + + # Encode - Simple input + input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) + input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length") + input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length") + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.encode("This is a simple input", padding="longest") + input_p = tokenizer_p.encode("This is a simple input", padding=True) + self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) + + # Encode - Pair input + input_r = tokenizer_r.encode( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode( + "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" + ) + input_p = tokenizer_p.encode( + "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" + ) + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode("This is a simple input", "This is a pair", padding=True) + input_p = tokenizer_p.encode("This is a simple input", "This is a pair", padding="longest") + self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) + + # Encode_plus - Simple input + input_r = tokenizer_r.encode_plus( + "This is a simple input", max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode_plus( + "This is a simple input", max_length=max_length, pad_to_max_length=True + ) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + + input_r = tokenizer_r.encode_plus( + "This is a simple input", max_length=max_length, padding="max_length" + ) + input_p = tokenizer_p.encode_plus( + "This is a simple input", max_length=max_length, padding="max_length" + ) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + + input_r = tokenizer_r.encode_plus("This is a simple input", padding="longest") + input_p = tokenizer_p.encode_plus("This is a 
simple input", padding=True) + self.assert_padded_input_match( + input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id + ) + + # Encode_plus - Pair input + input_r = tokenizer_r.encode_plus( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode_plus( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + + input_r = tokenizer_r.encode_plus( + "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" + ) + input_p = tokenizer_p.encode_plus( + "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" + ) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + + input_r = tokenizer_r.encode_plus("This is a simple input", "This is a pair", padding="longest") + input_p = tokenizer_p.encode_plus("This is a simple input", "This is a pair", padding=True) + self.assert_padded_input_match( + input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id + ) + + # Batch_encode_plus - Simple input + input_r = tokenizer_r.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + pad_to_max_length=True, + ) + input_p = tokenizer_p.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + pad_to_max_length=True, + ) + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + padding="max_length", + ) + input_p = tokenizer_p.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + padding="max_length", + ) + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + padding="longest", + ) + input_p = tokenizer_p.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + padding=True, + ) + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], padding="longest" + ) + input_p = tokenizer_p.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], padding=True + ) + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + # Batch_encode_plus - Pair input + input_r = tokenizer_r.batch_encode_plus( + [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ], + max_length=max_length, + truncation=True, + padding="max_length", + ) + input_p = tokenizer_p.batch_encode_plus( + [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ], + max_length=max_length, + truncation=True, + padding="max_length", + ) + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a 
simple pair 1", "This is a simple pair 2"), + ], + padding=True, + ) + input_p = tokenizer_p.batch_encode_plus( + [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ], + padding="longest", + ) + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + # Using pad on single examples after tokenization + input_r = tokenizer_r.encode_plus("This is a input 1") + input_r = tokenizer_r.pad(input_r) + + input_p = tokenizer_r.encode_plus("This is a input 1") + input_p = tokenizer_r.pad(input_p) + + self.assert_padded_input_match( + input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id + ) + + # Using pad on single examples after tokenization + input_r = tokenizer_r.encode_plus("This is a input 1") + input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") + + input_p = tokenizer_r.encode_plus("This is a input 1") + input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length") + + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + + # Using pad after tokenization + input_r = tokenizer_r.batch_encode_plus( + ["This is a input 1", "This is a much longer input whilch should be padded"] + ) + input_r = tokenizer_r.pad(input_r) + + input_p = tokenizer_r.batch_encode_plus( + ["This is a input 1", "This is a much longer input whilch should be padded"] + ) + input_p = tokenizer_r.pad(input_p) + + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + # Using pad after tokenization + input_r = tokenizer_r.batch_encode_plus( + ["This is a input 1", "This is a much longer input whilch should be padded"] + ) + input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") + + input_p = tokenizer_r.batch_encode_plus( + ["This is a input 1", "This is a much longer input whilch should be padded"] + ) + input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length") + + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + @slow + def test_save_pretrained(self): + super().test_save_pretrained() + + @slow + def test_save_slow_from_fast_and_reload_fast(self): + super().test_save_slow_from_fast_and_reload_fast() + + def assert_batch_padded_input_match( + self, + input_r: dict, + input_p: dict, + max_length: int, + pad_token_id: int, + model_main_input_name: str = "input_ids", + ): + for i_r in input_r.values(): + ( + self.assertEqual(len(i_r), 2), + self.assertEqual(len(i_r[0]), max_length), + self.assertEqual(len(i_r[1]), max_length), + ) + ( + self.assertEqual(len(i_r), 2), + self.assertEqual(len(i_r[0]), max_length), + self.assertEqual(len(i_r[1]), max_length), + ) + + for i_r, i_p in zip(input_r[model_main_input_name], input_p[model_main_input_name]): + self.assert_padded_input_match(i_r, i_p, max_length, pad_token_id) + + @slow + def test_tokenizer_integration(self): + expected_encoding = {'input_ids': [[4, 4616, 107, 163, 328, 14, 63, 1726, 106, 11954, 16659, 23, 83, 16688, 11427, 328, 107, 36, 11954, 16659, 23, 83, 16688, 6153, 82, 961, 16688, 3474, 16710, 1696, 2306, 16688, 10854, 2524, 3827, 561, 163, 3474, 16680, 62, 226, 2092, 16680, 379, 3474, 16660, 16680, 2436, 16667, 16671, 16680, 999, 87, 3474, 16680, 2436, 16667, 5208, 800, 16710, 68, 2018, 2959, 3037, 163, 16663, 11617, 16710, 36, 2018, 2959, 4737, 163, 16663, 16667, 16674, 16710, 91, 372, 5087, 16745, 
2205, 82, 961, 3608, 38, 1770, 16745, 7984, 36, 2565, 751, 9017, 1204, 864, 218, 1244, 16680, 11954, 16659, 23, 83, 36, 14686, 23, 7619, 16678, 5], [4, 28, 532, 65, 1929, 33, 391, 16688, 3979, 9, 2565, 7849, 299, 225, 34, 2040, 305, 167, 289, 16667, 16078, 32, 1966, 181, 4626, 63, 10575, 71, 851, 1491, 36, 624, 4757, 38, 208, 8038, 16678, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [4, 13, 1467, 5187, 26, 2521, 4567, 16664, 372, 13, 16209, 3314, 16678, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="google/fnet-base", + revision="34219a71ca20e280cc6000b89673a169c65d605c", + ) diff --git a/docs/transformers/tests/models/focalnet/__init__.py b/docs/transformers/tests/models/focalnet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/focalnet/test_modeling_focalnet.py b/docs/transformers/tests/models/focalnet/test_modeling_focalnet.py new file mode 100644 index 0000000000000000000000000000000000000000..d272f25891038466e2f818d60bedce1877a01004 --- /dev/null +++ b/docs/transformers/tests/models/focalnet/test_modeling_focalnet.py @@ -0,0 +1,441 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch FocalNet model.""" + +import collections +import unittest + +from transformers import FocalNetConfig +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_backbone_common import BackboneTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + FocalNetBackbone, + FocalNetForImageClassification, + FocalNetForMaskedImageModeling, + FocalNetModel, + ) + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class FocalNetModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=32, + patch_size=2, + num_channels=3, + embed_dim=16, + hidden_sizes=[32, 64, 128], + depths=[1, 2, 1], + num_heads=[2, 2, 4], + window_size=2, + mlp_ratio=2.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + use_absolute_embeddings=False, + patch_norm=True, + initializer_range=0.02, + layer_norm_eps=1e-5, + is_training=True, + scope=None, + use_labels=True, + type_sequence_label_size=10, + encoder_stride=8, + out_features=["stage1", "stage2"], + out_indices=[1, 2], + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.hidden_sizes = hidden_sizes + self.depths = depths + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.use_absolute_embeddings = use_absolute_embeddings + self.patch_norm = patch_norm + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.is_training = is_training + self.scope = scope + self.use_labels = use_labels + self.type_sequence_label_size = type_sequence_label_size + self.encoder_stride = encoder_stride + self.out_features = out_features + self.out_indices = out_indices + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return FocalNetConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + embed_dim=self.embed_dim, + hidden_sizes=self.hidden_sizes, + depths=self.depths, + num_heads=self.num_heads, + window_size=self.window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + drop_path_rate=self.drop_path_rate, + hidden_act=self.hidden_act, + use_absolute_embeddings=self.use_absolute_embeddings, + path_norm=self.patch_norm, + layer_norm_eps=self.layer_norm_eps, + initializer_range=self.initializer_range, + 
encoder_stride=self.encoder_stride, + out_features=self.out_features, + out_indices=self.out_indices, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = FocalNetModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + expected_seq_len = ((config.image_size // config.patch_size) ** 2) // (4 ** (len(config.depths) - 1)) + expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1)) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, expected_dim)) + + def create_and_check_backbone(self, config, pixel_values, labels): + model = FocalNetBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.image_size, 8, 8]) + + # verify channels + self.parent.assertEqual(len(model.channels), len(config.out_features)) + self.parent.assertListEqual(model.channels, config.hidden_sizes[:-1]) + + # verify backbone works with out_features=None + config.out_features = None + model = FocalNetBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), 1) + self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.image_size * 2, 4, 4]) + + # verify channels + self.parent.assertEqual(len(model.channels), 1) + self.parent.assertListEqual(model.channels, [config.hidden_sizes[-1]]) + + def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels): + model = FocalNetForMaskedImageModeling(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual( + result.reconstruction.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size) + ) + + # test greyscale images + config.num_channels = 1 + model = FocalNetForMaskedImageModeling(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.reconstruction.shape, (self.batch_size, 1, self.image_size, self.image_size)) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = FocalNetForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = FocalNetForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class FocalNetModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + FocalNetModel, + FocalNetForImageClassification, + 
FocalNetForMaskedImageModeling, + FocalNetBackbone, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + {"image-feature-extraction": FocalNetModel, "image-classification": FocalNetForImageClassification} + if is_torch_available() + else {} + ) + fx_compatible = False + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = FocalNetModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=FocalNetConfig, + embed_dim=37, + has_text_modality=False, + common_properties=["image_size", "patch_size", "num_channels", "hidden_sizes"], + ) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_backbone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_backbone(*config_and_inputs) + + def test_for_masked_image_modeling(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @unittest.skip(reason="FocalNet does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="FocalNet does not use feedforward chunking") + def test_feed_forward_chunking(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes[:-1]: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def check_hidden_states_output(self, inputs_dict, config, model_class, image_size): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # FocalNet has a different seq_length + patch_size = ( + config.patch_size + if isinstance(config.patch_size, collections.abc.Iterable) + else (config.patch_size, config.patch_size) + ) + + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [num_patches, self.model_tester.embed_dim], + ) + + reshaped_hidden_states = outputs.reshaped_hidden_states + self.assertEqual(len(reshaped_hidden_states), expected_num_layers) + + batch_size, num_channels, height, width = reshaped_hidden_states[0].shape + reshaped_hidden_states = ( + reshaped_hidden_states[0].view(batch_size, num_channels, height * width).permute(0, 2, 1) + ) + self.assertListEqual( + list(reshaped_hidden_states.shape[-2:]), + [num_patches, self.model_tester.embed_dim], + ) + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + image_size = ( + 
self.model_tester.image_size + if isinstance(self.model_tester.image_size, collections.abc.Iterable) + else (self.model_tester.image_size, self.model_tester.image_size) + ) + + for model_class in self.all_model_classes[:-1]: + inputs_dict["output_hidden_states"] = True + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + # check that output_hidden_states also works using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + self.check_hidden_states_output(inputs_dict, config, model_class, image_size) + + def test_hidden_states_output_with_padding(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.patch_size = 3 + + image_size = ( + self.model_tester.image_size + if isinstance(self.model_tester.image_size, collections.abc.Iterable) + else (self.model_tester.image_size, self.model_tester.image_size) + ) + patch_size = ( + config.patch_size + if isinstance(config.patch_size, collections.abc.Iterable) + else (config.patch_size, config.patch_size) + ) + + padded_height = image_size[0] + patch_size[0] - (image_size[0] % patch_size[0]) + padded_width = image_size[1] + patch_size[1] - (image_size[1] % patch_size[1]) + + for model_class in self.all_model_classes[:-1]: + inputs_dict["output_hidden_states"] = True + self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) + + # check that output_hidden_states also works using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) + + @slow + def test_model_from_pretrained(self): + model_name = "microsoft/focalnet-tiny" + model = FocalNetModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if "embeddings" not in name and param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + +@require_vision +@require_torch +class FocalNetModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + # TODO update organization + return AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = FocalNetForImageClassification.from_pretrained("microsoft/focalnet-tiny").to(torch_device) + image_processor = self.default_image_processor + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + expected_slice = torch.tensor([0.2166, -0.4368, 0.2191]).to(torch_device) + torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4) + self.assertEqual(outputs.logits.argmax(dim=-1).item(), 281) + + +@require_torch +class FocalNetBackboneTest(BackboneTesterMixin, unittest.TestCase): +
all_model_classes = (FocalNetBackbone,) if is_torch_available() else () + config_class = FocalNetConfig + + has_attentions = False + + def setUp(self): + self.model_tester = FocalNetModelTester(self) diff --git a/docs/transformers/tests/models/fsmt/__init__.py b/docs/transformers/tests/models/fsmt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/fsmt/test_modeling_fsmt.py b/docs/transformers/tests/models/fsmt/test_modeling_fsmt.py new file mode 100644 index 0000000000000000000000000000000000000000..e10a5fbae96143afa905d49fba832d392cffc898 --- /dev/null +++ b/docs/transformers/tests/models/fsmt/test_modeling_fsmt.py @@ -0,0 +1,605 @@ +# Copyright 2020 Huggingface +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +import unittest + +import timeout_decorator # noqa +from parameterized import parameterized + +from transformers import FSMTConfig, is_torch_available +from transformers.testing_utils import ( + require_sentencepiece, + require_tokenizers, + require_torch, + require_torch_fp16, + slow, + torch_device, +) +from transformers.utils import cached_property + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import FSMTForConditionalGeneration, FSMTModel, FSMTTokenizer + from transformers.models.fsmt.modeling_fsmt import ( + SinusoidalPositionalEmbedding, + _prepare_fsmt_decoder_inputs, + invert_mask, + shift_tokens_right, + ) + from transformers.pipelines import TranslationPipeline + + +class FSMTModelTester: + def __init__( + self, + parent, + src_vocab_size=99, + tgt_vocab_size=99, + langs=["ru", "en"], + batch_size=13, + seq_length=7, + is_training=False, + use_labels=False, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="relu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + bos_token_id=0, + pad_token_id=1, + eos_token_id=2, + ): + self.parent = parent + self.src_vocab_size = src_vocab_size + self.tgt_vocab_size = tgt_vocab_size + self.langs = langs + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.eos_token_id = eos_token_id + torch.manual_seed(0) + + # hack 
needed for modeling_common tests - despite not really having this attribute in this model + self.vocab_size = self.src_vocab_size + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.src_vocab_size).clamp( + 3, + ) + input_ids[:, -1] = 2 # Eos Token + + config = self.get_config() + inputs_dict = prepare_fsmt_inputs_dict(config, input_ids) + return config, inputs_dict + + def get_config(self): + return FSMTConfig( + vocab_size=self.src_vocab_size, # hack needed for common tests + src_vocab_size=self.src_vocab_size, + tgt_vocab_size=self.tgt_vocab_size, + langs=self.langs, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + ) + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + inputs_dict["decoder_input_ids"] = inputs_dict["input_ids"] + inputs_dict["decoder_attention_mask"] = inputs_dict["attention_mask"] + inputs_dict["use_cache"] = False + return config, inputs_dict + + +def prepare_fsmt_inputs_dict( + config, + input_ids, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if head_mask is None: + head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device) + if decoder_head_mask is None: + decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + if cross_attn_head_mask is None: + cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + } + + +@require_torch +class FSMTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (FSMTModel, FSMTForConditionalGeneration) if is_torch_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": FSMTModel, + "summarization": FSMTForConditionalGeneration, + "text2text-generation": FSMTForConditionalGeneration, + "translation": FSMTForConditionalGeneration, + } + if is_torch_available() + else {} + ) + is_encoder_decoder = True + test_pruning = False + test_missing_keys = False + + def setUp(self): + self.model_tester = FSMTModelTester(self) + self.langs = ["en", "ru"] + config = { + "langs": self.langs, + "src_vocab_size": 10, + "tgt_vocab_size": 20, + } + # XXX: hack to appease to all other models requiring `vocab_size` + config["vocab_size"] = 99 # no such thing in FSMT + self.config_tester = ConfigTester(self, config_class=FSMTConfig, **config) + + def test_config(self): + self.config_tester.run_common_tests() + + # XXX: override test_model_get_set_embeddings / different Embedding type + def test_model_get_set_embeddings(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + + for model_class in self.all_model_classes: + model = 
model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Embedding)) + model.set_input_embeddings(nn.Embedding(10, 10)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.modules.sparse.Embedding)) + + def test_initialization_more(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + model = FSMTModel(config) + model.to(torch_device) + model.eval() + # test init + # self.assertTrue((model.encoder.embed_tokens.weight == model.shared.weight).all().item()) + + def _check_var(module): + """Check that we initialized various parameters from N(0, config.init_std).""" + self.assertAlmostEqual(torch.std(module.weight).item(), config.init_std, 2) + + _check_var(model.encoder.embed_tokens) + _check_var(model.encoder.layers[0].self_attn.k_proj) + _check_var(model.encoder.layers[0].fc1) + # XXX: different std for fairseq version of SinusoidalPositionalEmbedding + # self.assertAlmostEqual(torch.std(model.encoder.embed_positions.weights).item(), config.init_std, 2) + + def test_advanced_inputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + config.use_cache = False + inputs_dict["input_ids"][:, -2:] = config.pad_token_id + decoder_input_ids, decoder_attn_mask, causal_mask = _prepare_fsmt_decoder_inputs( + config, inputs_dict["input_ids"] + ) + model = FSMTModel(config).to(torch_device).eval() + + decoder_features_with_created_mask = model(**inputs_dict)[0] + decoder_features_with_passed_mask = model( + decoder_attention_mask=invert_mask(decoder_attn_mask), decoder_input_ids=decoder_input_ids, **inputs_dict + )[0] + _assert_tensors_equal(decoder_features_with_passed_mask, decoder_features_with_created_mask) + useless_mask = torch.zeros_like(decoder_attn_mask) + decoder_features = model(decoder_attention_mask=useless_mask, **inputs_dict)[0] + self.assertTrue(isinstance(decoder_features, torch.Tensor)) # no hidden states or attentions + self.assertEqual( + decoder_features.size(), + (self.model_tester.batch_size, self.model_tester.seq_length, config.tgt_vocab_size), + ) + if decoder_attn_mask.min().item() < -1e3: # some tokens were masked + self.assertFalse((decoder_features_with_created_mask == decoder_features).all().item()) + + # Test different encoder attention masks + decoder_features_with_long_encoder_mask = model( + inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"].long() + )[0] + _assert_tensors_equal(decoder_features_with_long_encoder_mask, decoder_features_with_created_mask) + + def test_save_load_missing_keys(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_ensure_weights_are_shared(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + + config.tie_word_embeddings = True + model = FSMTForConditionalGeneration(config) + + # FSMT shares three weights. + # Not an issue to not have these correctly tied for torch.load, but it is an issue for safetensors. 
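+ # When the embeddings are tied, all three weight tensors share one storage, so the set of data_ptr() values checked below collapses to a single entry.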
+ self.assertEqual( + len( + { + model.get_output_embeddings().weight.data_ptr(), + model.get_input_embeddings().weight.data_ptr(), + model.base_model.decoder.output_projection.weight.data_ptr(), + } + ), + 1, + ) + + config.tie_word_embeddings = False + model = FSMTForConditionalGeneration(config) + + # FSMT shares three weights. + # Not an issue to not have these correctly tied for torch.load, but it is an issue for safetensors. + self.assertEqual( + len( + { + model.get_output_embeddings().weight.data_ptr(), + model.get_input_embeddings().weight.data_ptr(), + model.base_model.decoder.output_projection.weight.data_ptr(), + } + ), + 2, + ) + + @unittest.skip(reason="can't be implemented for FSMT due to dual vocab.") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Passing inputs_embeds not implemented for FSMT.") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Input ids is required for FSMT.") + def test_inputs_embeds_matches_input_ids(self): + pass + + @unittest.skip(reason="model weights aren't tied in FSMT.") + def test_tie_model_weights(self): + pass + + @unittest.skip(reason="TODO: Decoder embeddings cannot be resized at the moment") + def test_resize_embeddings_untied(self): + pass + + +@require_torch +class FSMTHeadTests(unittest.TestCase): + src_vocab_size = 99 + tgt_vocab_size = 99 + langs = ["ru", "en"] + + def _get_config(self): + return FSMTConfig( + src_vocab_size=self.src_vocab_size, + tgt_vocab_size=self.tgt_vocab_size, + langs=self.langs, + d_model=24, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + max_position_embeddings=48, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ) + + def _get_config_and_data(self): + input_ids = torch.tensor( + [ + [71, 82, 18, 33, 46, 91, 2], + [68, 34, 26, 58, 30, 82, 2], + [5, 97, 17, 39, 94, 40, 2], + [76, 83, 94, 25, 70, 78, 2], + [87, 59, 41, 35, 48, 66, 2], + [55, 13, 16, 58, 5, 2, 1], # note padding + [64, 27, 31, 51, 12, 75, 2], + [52, 64, 86, 17, 83, 39, 2], + [48, 61, 9, 24, 71, 82, 2], + [26, 1, 60, 48, 22, 13, 2], + [21, 5, 62, 28, 14, 76, 2], + [45, 98, 37, 86, 59, 48, 2], + [70, 70, 50, 9, 28, 0, 2], + ], + dtype=torch.long, + device=torch_device, + ) + + batch_size = input_ids.shape[0] + config = self._get_config() + return config, input_ids, batch_size + + def test_generate_beam_search(self): + input_ids = torch.tensor([[71, 82, 2], [68, 34, 2]], dtype=torch.long, device=torch_device) + config = self._get_config() + lm_model = FSMTForConditionalGeneration(config).to(torch_device) + lm_model.eval() + + max_length = 5 + new_input_ids = lm_model.generate( + input_ids.clone(), + do_sample=True, + num_return_sequences=1, + num_beams=2, + no_repeat_ngram_size=3, + max_length=max_length, + ) + self.assertEqual(new_input_ids.shape, (input_ids.shape[0], max_length)) + + def test_shift_tokens_right(self): + input_ids = torch.tensor([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]], dtype=torch.long) + shifted = shift_tokens_right(input_ids, 1) + n_pad_before = input_ids.eq(1).float().sum() + n_pad_after = shifted.eq(1).float().sum() + self.assertEqual(shifted.shape, input_ids.shape) + self.assertEqual(n_pad_after, n_pad_before - 1) + self.assertTrue(torch.eq(shifted[:, 0], 2).all()) + + @require_torch_fp16 + def test_generate_fp16(self): + config, input_ids, batch_size = self._get_config_and_data() + attention_mask = input_ids.ne(1).to(torch_device) + model = 
FSMTForConditionalGeneration(config).eval().to(torch_device) + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + def test_dummy_inputs(self): + config, *_ = self._get_config_and_data() + model = FSMTForConditionalGeneration(config).eval().to(torch_device) + model(**model.dummy_inputs) + + def test_prepare_fsmt_decoder_inputs(self): + config, *_ = self._get_config_and_data() + input_ids = _long_tensor([4, 4, 2]) + decoder_input_ids = _long_tensor([[26388, 2, config.pad_token_id]]) + causal_mask_dtype = torch.float32 + ignore = torch.finfo(causal_mask_dtype).min + decoder_input_ids, decoder_attn_mask, causal_mask = _prepare_fsmt_decoder_inputs( + config, input_ids, decoder_input_ids, causal_mask_dtype=causal_mask_dtype + ) + expected_causal_mask = torch.tensor( + [[0, ignore, ignore], [0, 0, ignore], [0, 0, 0]] # never attend to the final token, because its pad + ).to(input_ids.device) + self.assertEqual(decoder_attn_mask.size(), decoder_input_ids.size()) + self.assertTrue(torch.eq(expected_causal_mask, causal_mask).all()) + + +def _assert_tensors_equal(a, b, atol=1e-12, prefix=""): + """If tensors not close, or a and b arent both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + if len(prefix) > 0: + prefix = f"{prefix}: " + raise AssertionError(f"{prefix}{a} != {b}") + + +def _long_tensor(tok_lst): + return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) + + +TOLERANCE = 1e-4 + + +pairs = [ + ["en-ru"], + ["ru-en"], + ["en-de"], + ["de-en"], +] + + +@require_torch +@require_sentencepiece +@require_tokenizers +class FSMTModelIntegrationTests(unittest.TestCase): + tokenizers_cache = {} + models_cache = {} + default_mname = "facebook/wmt19-en-ru" + + @cached_property + def default_tokenizer(self): + return self.get_tokenizer(self.default_mname) + + @cached_property + def default_model(self): + return self.get_model(self.default_mname) + + def get_tokenizer(self, mname): + if mname not in self.tokenizers_cache: + self.tokenizers_cache[mname] = FSMTTokenizer.from_pretrained(mname) + return self.tokenizers_cache[mname] + + def get_model(self, mname): + if mname not in self.models_cache: + self.models_cache[mname] = FSMTForConditionalGeneration.from_pretrained(mname).to(torch_device) + if torch_device == "cuda": + self.models_cache[mname].half() + return self.models_cache[mname] + + @slow + def test_inference_no_head(self): + tokenizer = self.default_tokenizer + model = FSMTModel.from_pretrained(self.default_mname).to(torch_device) + + src_text = "My friend computer will translate this for me" + input_ids = tokenizer([src_text], return_tensors="pt")["input_ids"] + input_ids = _long_tensor(input_ids).to(torch_device) + inputs_dict = prepare_fsmt_inputs_dict(model.config, input_ids) + with torch.no_grad(): + output = model(**inputs_dict)[0] + expected_shape = torch.Size((1, 10, model.config.tgt_vocab_size)) + self.assertEqual(output.shape, expected_shape) + # expected numbers were generated when en-ru model, using just fairseq's model4.pt + # may have to adjust if switched to a different checkpoint + expected_slice = torch.tensor( + [[-1.5753, -1.5753, 2.8975], [-0.9540, -0.9540, 1.0299], [-3.3131, -3.3131, 0.5219]] + ).to(torch_device) + torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=TOLERANCE, atol=TOLERANCE) + + def 
translation_setup(self, pair): + text = { + "en": "Machine learning is great, isn't it?", + "ru": "Машинное обучение - это здорово, не так ли?", + "de": "Maschinelles Lernen ist großartig, oder?", + } + + src, tgt = pair.split("-") + print(f"Testing {src} -> {tgt}") + mname = f"facebook/wmt19-{pair}" + + src_text = text[src] + tgt_text = text[tgt] + + tokenizer = self.get_tokenizer(mname) + model = self.get_model(mname) + return tokenizer, model, src_text, tgt_text + + @parameterized.expand(pairs) + @slow + def test_translation_direct(self, pair): + tokenizer, model, src_text, tgt_text = self.translation_setup(pair) + + input_ids = tokenizer.encode(src_text, return_tensors="pt").to(torch_device) + + outputs = model.generate(input_ids) + decoded = tokenizer.decode(outputs[0], skip_special_tokens=True) + assert decoded == tgt_text, f"\n\ngot: {decoded}\nexp: {tgt_text}\n" + + @parameterized.expand(pairs) + @slow + def test_translation_pipeline(self, pair): + tokenizer, model, src_text, tgt_text = self.translation_setup(pair) + pipeline = TranslationPipeline(model, tokenizer, framework="pt", device=torch_device) + output = pipeline([src_text]) + self.assertEqual([tgt_text], [x["translation_text"] for x in output]) + + +@require_torch +class TestSinusoidalPositionalEmbeddings(unittest.TestCase): + padding_idx = 1 + tolerance = 1e-4 + + def test_basic(self): + input_ids = torch.tensor([[4, 10]], dtype=torch.long, device=torch_device) + emb1 = SinusoidalPositionalEmbedding(num_positions=6, embedding_dim=6, padding_idx=self.padding_idx).to( + torch_device + ) + emb1.make_weight(*emb1.weight.shape, emb1.padding_idx) + emb = emb1(input_ids) + desired_weights = torch.tensor( + [ + [9.0930e-01, 1.9999e-02, 2.0000e-04, -4.1615e-01, 9.9980e-01, 1.0000e00], + [1.4112e-01, 2.9995e-02, 3.0000e-04, -9.8999e-01, 9.9955e-01, 1.0000e00], + ] + ).to(torch_device) + self.assertTrue( + torch.allclose(emb[0], desired_weights, atol=self.tolerance), + msg=f"\nexp:\n{desired_weights}\ngot:\n{emb[0]}\n", + ) + + def test_odd_embed_dim(self): + # odd embedding_dim is allowed + test = SinusoidalPositionalEmbedding(num_positions=4, embedding_dim=5, padding_idx=self.padding_idx).to( + torch_device + ) + test.make_weight(*test.weight.shape, test.padding_idx) + + # odd num_embeddings is allowed + test = SinusoidalPositionalEmbedding(num_positions=5, embedding_dim=4, padding_idx=self.padding_idx).to( + torch_device + ) + test.make_weight(*test.weight.shape, test.padding_idx) + + @unittest.skip(reason="different from marian (needs more research)") + def test_positional_emb_weights_against_marian(self): + desired_weights = torch.tensor( + [ + [0, 0, 0, 0, 0], + [0.84147096, 0.82177866, 0.80180490, 0.78165019, 0.76140374], + [0.90929741, 0.93651021, 0.95829457, 0.97505713, 0.98720258], + ] + ) + emb1 = SinusoidalPositionalEmbedding(num_positions=512, embedding_dim=512, padding_idx=self.padding_idx).to( + torch_device + ) + emb1.make_weight(*emb1.weight.shape, emb1.padding_idx) + weights = emb1.weights.data[:3, :5] + # XXX: only the 1st and 3rd lines match - this is testing against + # verbatim copy of SinusoidalPositionalEmbedding from fairseq + self.assertTrue( + torch.allclose(weights, desired_weights, atol=self.tolerance), + msg=f"\nexp:\n{desired_weights}\ngot:\n{weights}\n", + ) + + # test that forward pass is just a lookup, there is no ignore padding logic + input_ids = torch.tensor( + [[4, 10, self.padding_idx, self.padding_idx, self.padding_idx]], dtype=torch.long, device=torch_device + ) + no_cache_pad_zero = 
emb1(input_ids)[0] + # XXX: only the 1st line matches the 3rd + torch.testing.assert_close( + torch.tensor(desired_weights, device=torch_device), no_cache_pad_zero[:3, :5], rtol=1e-3, atol=1e-3 + ) diff --git a/docs/transformers/tests/models/fsmt/test_tokenization_fsmt.py b/docs/transformers/tests/models/fsmt/test_tokenization_fsmt.py new file mode 100644 index 0000000000000000000000000000000000000000..bfaf3df195935dd94ea8714fee0032d8e981d6fc --- /dev/null +++ b/docs/transformers/tests/models/fsmt/test_tokenization_fsmt.py @@ -0,0 +1,169 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import unittest + +from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES, FSMTTokenizer +from transformers.testing_utils import slow +from transformers.utils import cached_property + +from ...test_tokenization_common import TokenizerTesterMixin + + +# using a different tiny model than the one used for default params defined in init to ensure proper testing +FSMT_TINY2 = "stas/tiny-wmt19-en-ru" + + +class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "stas/tiny-wmt19-en-de" + tokenizer_class = FSMTTokenizer + test_rust_tokenizer = False + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "w</w>", + "r</w>", + "t</w>", + "lo", + "low", + "er</w>", + "low</w>", + "lowest</w>", + "newer</w>", + "wider</w>", + "<unk>", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""] + + cls.langs = ["en", "ru"] + config = { + "langs": cls.langs, + "src_vocab_size": 10, + "tgt_vocab_size": 20, + } + + cls.src_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["src_vocab_file"]) + cls.tgt_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["tgt_vocab_file"]) + config_file = os.path.join(cls.tmpdirname, "tokenizer_config.json") + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.src_vocab_file, "w") as fp: + fp.write(json.dumps(vocab_tokens)) + with open(cls.tgt_vocab_file, "w") as fp: + fp.write(json.dumps(vocab_tokens)) + with open(cls.merges_file, "w") as fp: + fp.write("\n".join(merges)) + with open(config_file, "w") as fp: + fp.write(json.dumps(config)) + + @cached_property + def tokenizer_ru_en(self): + return FSMTTokenizer.from_pretrained("facebook/wmt19-ru-en") + + @cached_property + def tokenizer_en_ru(self): + return FSMTTokenizer.from_pretrained("facebook/wmt19-en-ru") + + def test_online_tokenizer_config(self): + """this just tests that the online tokenizer files get correctly fetched and + loaded via its tokenizer_config.json and it's not slow so it's run by normal CI + """ + tokenizer = FSMTTokenizer.from_pretrained(FSMT_TINY2) + self.assertListEqual([tokenizer.src_lang, tokenizer.tgt_lang], ["en", "ru"]) + self.assertEqual(tokenizer.src_vocab_size, 21) + self.assertEqual(tokenizer.tgt_vocab_size, 21) + + def test_full_tokenizer(self): + """Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt""" + tokenizer = FSMTTokenizer(self.langs, self.src_vocab_file, self.tgt_vocab_file, self.merges_file) + + text = "lower" + bpe_tokens = ["low", "er</w>"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + ["<unk>"] + input_bpe_tokens = [14, 15, 20] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_ru_en + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == text + [2] + assert encoded_pair == text + [2] + text_2 + [2] + + @slow + def test_match_encode_decode(self): + tokenizer_enc = self.tokenizer_en_ru + tokenizer_dec = self.tokenizer_ru_en + + targets = [ + [ + "Here's a little song I wrote. Don't worry, be happy.", + [2470, 39, 11, 2349, 7222, 70, 5979, 7, 8450, 1050, 13160, 5, 26, 6445, 7, 2], + ], + ["This is it. No more. 
I'm done!", [132, 21, 37, 7, 1434, 86, 7, 70, 6476, 1305, 427, 2]], + ] + + # if data needs to be recreated or added, run: + # import torch + # model = torch.hub.load("pytorch/fairseq", "transformer.wmt19.en-ru", checkpoint_file="model4.pt", tokenizer="moses", bpe="fastbpe") + # for src_text, _ in targets: print(f"""[\n"{src_text}",\n {model.encode(src_text).tolist()}\n],""") + + for src_text, tgt_input_ids in targets: + encoded_ids = tokenizer_enc.encode(src_text, return_tensors=None) + self.assertListEqual(encoded_ids, tgt_input_ids) + + # and decode backward, using the reversed languages model + decoded_text = tokenizer_dec.decode(encoded_ids, skip_special_tokens=True) + self.assertEqual(decoded_text, src_text) + + @slow + def test_tokenizer_lower(self): + tokenizer = FSMTTokenizer.from_pretrained("facebook/wmt19-ru-en", do_lower_case=True) + tokens = tokenizer.tokenize("USA is United States of America") + expected = ["us", "a</w>", "is</w>", "un", "i", "ted</w>", "st", "ates</w>", "of</w>", "am", "er", "ica</w>"] + self.assertListEqual(tokens, expected) + + @unittest.skip(reason="FSMTConfig.__init__ requires non-optional args") + def test_torch_encode_plus_sent_to_model(self): + pass + + @unittest.skip(reason="FSMTConfig.__init__ requires non-optional args") + def test_np_encode_plus_sent_to_model(self): + pass diff --git a/docs/transformers/tests/models/funnel/__init__.py b/docs/transformers/tests/models/funnel/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/funnel/test_modeling_funnel.py b/docs/transformers/tests/models/funnel/test_modeling_funnel.py new file mode 100644 index 0000000000000000000000000000000000000000..3d28924bee1c2c32b2a2623d20a8e181e4475500 --- /dev/null +++ b/docs/transformers/tests/models/funnel/test_modeling_funnel.py @@ -0,0 +1,524 @@ +# Copyright 2020 HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +from transformers import FunnelConfig, FunnelTokenizer, is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + FunnelBaseModel, + FunnelForMaskedLM, + FunnelForMultipleChoice, + FunnelForPreTraining, + FunnelForQuestionAnswering, + FunnelForSequenceClassification, + FunnelForTokenClassification, + FunnelModel, + ) + + +class FunnelModelTester: + """You can also import this e.g, from .test_modeling_funnel import FunnelModelTester""" + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + block_sizes=[1, 1, 2], + num_decoder_layers=1, + d_model=32, + n_head=4, + d_head=8, + d_inner=37, + hidden_act="gelu_new", + hidden_dropout=0.1, + attention_dropout=0.1, + activation_dropout=0.0, + max_position_embeddings=512, + type_vocab_size=3, + initializer_std=0.02, # Set to a smaller value, so we can keep the small error threshold (1e-5) in the test + num_labels=3, + num_choices=4, + scope=None, + base=False, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.block_sizes = block_sizes + self.num_decoder_layers = num_decoder_layers + self.d_model = d_model + self.n_head = n_head + self.d_head = d_head + self.d_inner = d_inner + self.hidden_act = hidden_act + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = 2 + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.initializer_std = initializer_std + + # Used in the tests to check the size of the first attention layer + self.num_attention_heads = n_head + # Used in the tests to check the size of the first hidden state + self.hidden_size = self.d_model + # Used in the tests to check the number of output hidden states/attentions + self.num_hidden_layers = sum(self.block_sizes) + (0 if base else self.num_decoder_layers) + # FunnelModel adds two hidden layers: input embeddings and the sum of the upsampled encoder hidden state with + # the last hidden state of the first block (which is the first hidden state of the decoder). 
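+ # Hence the extra two layers added to expected_num_hidden_layers below when testing the full (non-base) model.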
+ if not base: + self.expected_num_hidden_layers = self.num_hidden_layers + 2 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1) + + config = self.get_config() + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ) + + def get_config(self): + return FunnelConfig( + vocab_size=self.vocab_size, + block_sizes=self.block_sizes, + num_decoder_layers=self.num_decoder_layers, + d_model=self.d_model, + n_head=self.n_head, + d_head=self.d_head, + d_inner=self.d_inner, + hidden_act=self.hidden_act, + hidden_dropout=self.hidden_dropout, + attention_dropout=self.attention_dropout, + activation_dropout=self.activation_dropout, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_std=self.initializer_std, + ) + + def create_and_check_model( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = FunnelModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) + + model.config.truncate_seq = False + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) + + model.config.separate_cls = False + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) + + def create_and_check_base_model( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = FunnelBaseModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 2, self.d_model)) + + model.config.truncate_seq = False + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 3, self.d_model)) + + model.config.separate_cls = False + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 2, self.d_model)) + + def create_and_check_for_pretraining( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_labels = self.num_labels + model = 
FunnelForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_masked_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = FunnelForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_labels = self.num_labels + model = FunnelForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_choices = self.num_choices + model = FunnelForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_token_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_labels = self.num_labels + model = FunnelForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_question_answering( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = FunnelForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ) = config_and_inputs 
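+ # Only the tensors consumed by a plain forward pass are exposed to the common tests; the label tensors unpacked above are intentionally left out.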
+ inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class FunnelModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + test_head_masking = False + test_pruning = False + all_model_classes = ( + ( + FunnelModel, + FunnelForMaskedLM, + FunnelForPreTraining, + FunnelForQuestionAnswering, + FunnelForTokenClassification, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": (FunnelBaseModel, FunnelModel), + "fill-mask": FunnelForMaskedLM, + "question-answering": FunnelForQuestionAnswering, + "text-classification": FunnelForSequenceClassification, + "token-classification": FunnelForTokenClassification, + "zero-shot": FunnelForSequenceClassification, + } + if is_torch_available() + else {} + ) + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = FunnelModelTester(self) + self.config_tester = ConfigTester(self, config_class=FunnelConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_pretraining(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + + for param in ["r_w_bias", "r_r_bias", "r_kernel", "r_s_bias", "seg_embed"]: + if hasattr(module, param) and getattr(module, param) is not None: + weight = getattr(module, param) + weight.data.fill_(3) + + +@require_torch +class FunnelBaseModelTest(ModelTesterMixin, unittest.TestCase): + test_head_masking = False + test_pruning = False + all_model_classes = ( + (FunnelBaseModel, FunnelForMultipleChoice, FunnelForSequenceClassification) if is_torch_available() else () + ) + + def setUp(self): + self.model_tester = FunnelModelTester(self, base=True) + self.config_tester = ConfigTester(self, config_class=FunnelConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_base_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_base_model(*config_and_inputs) + + 
def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + # overwrite from test_modeling_common + def test_training(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + if model_class.__name__ == "FunnelBaseModel": + continue + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + + for param in ["r_w_bias", "r_r_bias", "r_kernel", "r_s_bias", "seg_embed"]: + if hasattr(module, param) and getattr(module, param) is not None: + weight = getattr(module, param) + weight.data.fill_(3) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class FunnelModelIntegrationTest(unittest.TestCase): + def test_inference_tiny_model(self): + batch_size = 13 + sequence_length = 7 + input_ids = torch.arange(0, batch_size * sequence_length).long().reshape(batch_size, sequence_length) + lengths = [0, 1, 2, 3, 4, 5, 6, 4, 1, 3, 5, 0, 1] + token_type_ids = torch.tensor([[2] + [0] * a + [1] * (sequence_length - a - 1) for a in lengths]) + + model = FunnelModel.from_pretrained("sgugger/funnel-random-tiny") + output = model(input_ids, token_type_ids=token_type_ids)[0].abs() + + expected_output_sum = torch.tensor(2344.8352) + expected_output_mean = torch.tensor(0.8052) + torch.testing.assert_close(output.sum(), expected_output_sum, rtol=1e-4, atol=1e-4) + torch.testing.assert_close(output.mean(), expected_output_mean, rtol=1e-4, atol=1e-4) + + attention_mask = torch.tensor([[1] * 7, [1] * 4 + [0] * 3] * 6 + [[0, 1, 1, 0, 0, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0].abs() + + expected_output_sum = torch.tensor(2343.8425) + expected_output_mean = torch.tensor(0.8049) + torch.testing.assert_close(output.sum(), expected_output_sum, rtol=1e-4, atol=1e-4) + torch.testing.assert_close(output.mean(), expected_output_mean, rtol=1e-4, atol=1e-4) + + @slow + def test_inference_model(self): + tokenizer = FunnelTokenizer.from_pretrained("huggingface/funnel-small") + model = FunnelModel.from_pretrained("huggingface/funnel-small") + inputs = tokenizer("Hello! 
I am the Funnel Transformer model.", return_tensors="pt") + output = model(**inputs)[0] + + expected_output_sum = torch.tensor(235.7246) + expected_output_mean = torch.tensor(0.0256) + torch.testing.assert_close(output.sum(), expected_output_sum, rtol=1e-4, atol=1e-4) + torch.testing.assert_close(output.mean(), expected_output_mean, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/funnel/test_modeling_tf_funnel.py b/docs/transformers/tests/models/funnel/test_modeling_tf_funnel.py new file mode 100644 index 0000000000000000000000000000000000000000..673982eb7b65435bf96147b7e4e27081078299b2 --- /dev/null +++ b/docs/transformers/tests/models/funnel/test_modeling_tf_funnel.py @@ -0,0 +1,414 @@ +# Copyright 2020 HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import annotations + +import unittest + +from transformers import FunnelConfig, is_tf_available +from transformers.testing_utils import require_tf + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TFFunnelBaseModel, + TFFunnelForMaskedLM, + TFFunnelForMultipleChoice, + TFFunnelForPreTraining, + TFFunnelForQuestionAnswering, + TFFunnelForSequenceClassification, + TFFunnelForTokenClassification, + TFFunnelModel, + ) + + +class TFFunnelModelTester: + """You can also import this e.g, from .test_modeling_funnel import FunnelModelTester""" + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + block_sizes=[1, 1, 2], + num_decoder_layers=1, + d_model=32, + n_head=4, + d_head=8, + d_inner=37, + hidden_act="gelu_new", + hidden_dropout=0.1, + attention_dropout=0.1, + activation_dropout=0.0, + max_position_embeddings=512, + type_vocab_size=3, + initializer_std=0.02, # Set to a smaller value, so we can keep the small error threshold (1e-5) in the test + num_labels=3, + num_choices=4, + scope=None, + base=False, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.block_sizes = block_sizes + self.num_decoder_layers = num_decoder_layers + self.d_model = d_model + self.n_head = n_head + self.d_head = d_head + self.d_inner = d_inner + self.hidden_act = hidden_act + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = 2 + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + 
self.initializer_std = initializer_std + + # Used in the tests to check the size of the first attention layer + self.num_attention_heads = n_head + # Used in the tests to check the size of the first hidden state + self.hidden_size = self.d_model + # Used in the tests to check the number of output hidden states/attentions + self.num_hidden_layers = sum(self.block_sizes) + (0 if base else self.num_decoder_layers) + # FunnelModel adds two hidden layers: input embeddings and the sum of the upsampled encoder hidden state with + # the last hidden state of the first block (which is the first hidden state of the decoder). + if not base: + self.expected_num_hidden_layers = self.num_hidden_layers + 2 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = FunnelConfig( + vocab_size=self.vocab_size, + block_sizes=self.block_sizes, + num_decoder_layers=self.num_decoder_layers, + d_model=self.d_model, + n_head=self.n_head, + d_head=self.d_head, + d_inner=self.d_inner, + hidden_act=self.hidden_act, + hidden_dropout=self.hidden_dropout, + attention_dropout=self.attention_dropout, + activation_dropout=self.activation_dropout, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_std=self.initializer_std, + ) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) + + def create_and_check_model( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = TFFunnelModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) + + config.truncate_seq = False + model = TFFunnelModel(config=config) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) + + config.separate_cls = False + model = TFFunnelModel(config=config) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) + + def create_and_check_base_model( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = TFFunnelBaseModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 2, self.d_model)) + + config.truncate_seq = False + model = 
TFFunnelBaseModel(config=config) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 3, self.d_model)) + + config.separate_cls = False + model = TFFunnelBaseModel(config=config) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 2, self.d_model)) + + def create_and_check_for_pretraining( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = TFFunnelForPreTraining(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_masked_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = TFFunnelForMaskedLM(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + config.num_labels = self.num_labels + model = TFFunnelForSequenceClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + config.num_choices = self.num_choices + model = TFFunnelForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_token_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + config.num_labels = self.num_labels + model = TFFunnelForTokenClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_question_answering( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ): + model = TFFunnelForQuestionAnswering(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def 
prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFFunnelModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + TFFunnelModel, + TFFunnelForMaskedLM, + TFFunnelForPreTraining, + TFFunnelForQuestionAnswering, + TFFunnelForTokenClassification, + ) + if is_tf_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": (TFFunnelBaseModel, TFFunnelModel), + "fill-mask": TFFunnelForMaskedLM, + "question-answering": TFFunnelForQuestionAnswering, + "text-classification": TFFunnelForSequenceClassification, + "token-classification": TFFunnelForTokenClassification, + "zero-shot": TFFunnelForSequenceClassification, + } + if is_tf_available() + else {} + ) + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFFunnelModelTester(self) + self.config_tester = ConfigTester(self, config_class=FunnelConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_pretraining(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + +@require_tf +class TFFunnelBaseModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = ( + (TFFunnelBaseModel, TFFunnelForMultipleChoice, TFFunnelForSequenceClassification) if is_tf_available() else () + ) + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFFunnelModelTester(self, base=True) + self.config_tester = ConfigTester(self, config_class=FunnelConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_base_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_base_model(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) diff --git a/docs/transformers/tests/models/funnel/test_tokenization_funnel.py b/docs/transformers/tests/models/funnel/test_tokenization_funnel.py new file mode 100644 index 0000000000000000000000000000000000000000..2d04fc0ac2d3cbb933b8b314c0019c2e0a07faa1 --- /dev/null +++ 
b/docs/transformers/tests/models/funnel/test_tokenization_funnel.py @@ -0,0 +1,92 @@ +# Copyright 2020 HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import unittest +from functools import lru_cache + +from transformers import FunnelTokenizer, FunnelTokenizerFast +from transformers.models.funnel.tokenization_funnel import VOCAB_FILES_NAMES +from transformers.testing_utils import require_tokenizers + +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible + + +@require_tokenizers +class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "funnel-transformer/small" + tokenizer_class = FunnelTokenizer + rust_tokenizer_class = FunnelTokenizerFast + test_rust_tokenizer = True + space_between_special_tokens = True + + @classmethod + def setUpClass(cls): + super().setUpClass() + + vocab_tokens = [ + "", + "", + "", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + pretrained_name = pretrained_name or cls.tmpdirname + return FunnelTokenizer.from_pretrained(pretrained_name, **kwargs) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + pretrained_name = pretrained_name or cls.tmpdirname + return FunnelTokenizerFast.from_pretrained(pretrained_name, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00e9d,running" + output_text = "unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize("UNwant\u00e9d,running") + self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + + def test_token_type_ids(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + inputs = tokenizer("UNwant\u00e9d,running") + sentence_len = len(inputs["input_ids"]) - 1 + self.assertListEqual(inputs["token_type_ids"], [2] + [0] * sentence_len) + + inputs = tokenizer("UNwant\u00e9d,running", "UNwant\u00e9d,running") + self.assertListEqual(inputs["token_type_ids"], [2] + [0] * sentence_len + [1] * sentence_len) diff --git a/docs/transformers/tests/models/fuyu/__init__.py b/docs/transformers/tests/models/fuyu/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/fuyu/test_image_processing_fuyu.py b/docs/transformers/tests/models/fuyu/test_image_processing_fuyu.py new file mode 
100644 index 0000000000000000000000000000000000000000..fd9fea1f741a2d27f93fb1ea0e4fd63b1c10c599 --- /dev/null +++ b/docs/transformers/tests/models/fuyu/test_image_processing_fuyu.py @@ -0,0 +1,63 @@ +import unittest + +import numpy as np + +from transformers import is_torch_available, is_vision_available +from transformers.testing_utils import ( + require_torch, + require_torchvision, + require_vision, +) + + +if is_torch_available() and is_vision_available(): + import torch + + from transformers import FuyuImageProcessor + +if is_vision_available(): + from PIL import Image + + +@require_torch +@require_vision +@require_torchvision +class TestFuyuImageProcessor(unittest.TestCase): + def setUp(self): + self.size = {"height": 160, "width": 320} + self.processor = FuyuImageProcessor(size=self.size, padding_value=1.0) + self.batch_size = 3 + self.channels = 3 + self.height = 300 + self.width = 300 + + self.image_input = torch.rand(self.batch_size, self.channels, self.height, self.width) + + self.image_patch_dim_h = 30 + self.image_patch_dim_w = 30 + self.sample_image = np.zeros((450, 210, 3), dtype=np.uint8) + self.sample_image_pil = Image.fromarray(self.sample_image) + + def test_patches(self): + expected_num_patches = self.processor.get_num_patches(image_height=self.height, image_width=self.width) + + patches_final = self.processor.patchify_image(image=self.image_input) + assert patches_final.shape[1] == expected_num_patches, ( + f"Expected {expected_num_patches} patches, got {patches_final.shape[1]}." + ) + + def test_scale_to_target_aspect_ratio(self): + # (h:450, w:210) fitting (160, 320) -> (160, 210*160/450) + scaled_image = self.processor.resize(self.sample_image, size=self.size) + self.assertEqual(scaled_image.shape[0], 160) + self.assertEqual(scaled_image.shape[1], 74) + + def test_apply_transformation_numpy(self): + transformed_image = self.processor.preprocess(self.sample_image).images[0][0] + self.assertEqual(transformed_image.shape[1], 160) + self.assertEqual(transformed_image.shape[2], 320) + + def test_apply_transformation_pil(self): + transformed_image = self.processor.preprocess(self.sample_image_pil).images[0][0] + self.assertEqual(transformed_image.shape[1], 160) + self.assertEqual(transformed_image.shape[2], 320) diff --git a/docs/transformers/tests/models/fuyu/test_modeling_fuyu.py b/docs/transformers/tests/models/fuyu/test_modeling_fuyu.py new file mode 100644 index 0000000000000000000000000000000000000000..06f0171e46aa29ff5d117d2ac2f62bc742e41a4a --- /dev/null +++ b/docs/transformers/tests/models/fuyu/test_modeling_fuyu.py @@ -0,0 +1,300 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
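For orientation, the numbers asserted in the FuyuImageProcessor tests above can be re-derived with plain arithmetic (the sketch below is illustrative only and does not call the processor): a 300x300 input cut into the 30x30 patches the tests assume gives 10 * 10 = 100 patches, and fitting the 450x210 sample image into the 160x320 target while preserving aspect ratio scales both sides by 160/450, so the resized image is 160x74 before being padded out to the full 160x320 target checked by the test_apply_transformation_* cases.

# Pure-arithmetic restatement of the expected values in the tests above.
height, width = 300, 300                          # the random tensor fed to patchify_image
patch_h, patch_w = 30, 30                         # patch size the tests assume
print((height // patch_h) * (width // patch_w))   # 100 patches

# Aspect-ratio-preserving fit of the 450x210 sample into the 160x320 target:
# the height is the limiting side, so both dimensions shrink by 160/450.
print(450 * 160 // 450, 210 * 160 // 450)         # 160 74, as in test_scale_to_target_aspect_ratio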
+"""Testing suite for the PyTorch Fuyu model.""" + +import io +import unittest + +import pytest +import requests +from parameterized import parameterized + +from transformers import FuyuConfig, is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_torch_accelerator, slow +from transformers.utils import cached_property + +from ...generation.test_utils import GenerationTesterMixin +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_vision_available(): + from PIL import Image + + +if is_torch_available() and is_vision_available(): + from transformers import FuyuProcessor + + +if is_torch_available(): + from transformers import FuyuForCausalLM + + +class FuyuModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + image_size=30, + patch_size=15, + num_channels=3, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + pad_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + sequence_labels = None + token_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + + config = self.get_config() + + return config, input_ids, input_mask, sequence_labels, token_labels + + def get_config(self): + return FuyuConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + ) + + def prepare_config_and_inputs_for_common(self): + 
config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + input_mask, + sequence_labels, + token_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class FuyuModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (FuyuForCausalLM,) if is_torch_available() else () + pipeline_model_mapping = ( + {"text-generation": FuyuForCausalLM, "image-text-to-text": FuyuForCausalLM} if is_torch_available() else {} + ) + + test_head_masking = False + test_pruning = False + test_cpu_offload = False + test_disk_offload = False + test_model_parallel = False + + def setUp(self): + self.model_tester = FuyuModelTester(self) + + @unittest.skip( + reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @parameterized.expand([("random",), ("same",)]) + @pytest.mark.generate + @unittest.skip("Fuyu doesn't support assisted generation due to the need to crop/extend image patches indices") + def test_assisted_decoding_matches_greedy_search(self): + pass + + @pytest.mark.generate + @unittest.skip("Fuyu doesn't support assisted generation due to the need to crop/extend image patches indices") + def test_assisted_decoding_sample(self): + pass + + # TODO: Fix me (once this model gets more usage) + @unittest.skip(reason="Does not work on the tiny model.") + def test_disk_offload_bin(self): + super().test_disk_offload() + + # TODO: Fix me (once this model gets more usage) + @unittest.skip(reason="Does not work on the tiny model.") + def test_disk_offload_safetensors(self): + super().test_disk_offload() + + # TODO: Fix me (once this model gets more usage) + @unittest.skip(reason="Does not work on the tiny model.") + def test_model_parallelism(self): + super().test_model_parallelism() + + @unittest.skip(reason="Fuyu `prepare_inputs_for_generation` function doesn't have cache position.") + def test_generate_continue_from_inputs_embeds(self): + pass + + +@slow +@require_torch_accelerator +class FuyuModelIntegrationTest(unittest.TestCase): + @cached_property + def default_processor(self): + return FuyuProcessor.from_pretrained("adept/fuyu-8b") + + @cached_property + def default_model(self): + return FuyuForCausalLM.from_pretrained("adept/fuyu-8b") + + def test_greedy_generation(self): + processor = self.default_processor + model = self.default_model + + url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png" + image = Image.open(io.BytesIO(requests.get(url).content)) + + text_prompt_coco_captioning = "Generate a coco-style caption.\n" + + inputs = processor(images=image, text=text_prompt_coco_captioning, return_tensors="pt") + generated_ids = model.generate(**inputs, max_new_tokens=10) + + # take the last 8 tokens (in order to skip special \n\x04 characters) and decode them + 
generated_text = processor.batch_decode(generated_ids[:, -8:], skip_special_tokens=True)[0] + self.assertEqual(generated_text, "A blue bus parked on the side of a road.") + + +""" + @slow + @require_torch_accelerator + def test_model_8b_chat_greedy_generation_bus_color(self): + EXPECTED_TEXT_COMPLETION = "The bus is blue.\n|ENDOFTEXT|" + text_prompt_bus_color = "What color is the bus?\n" + model_inputs_bus_color = self.processor(text=text_prompt_bus_color, images=self.bus_image_pil) + + generated_tokens = self.model.generate(**model_inputs_bus_color, max_new_tokens=10) + text = self.processor.tokenizer.batch_decode(generated_tokens) + end_sequence = text[0].split("\x04")[1] + clean_sequence = ( + end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")] + if "|ENDOFTEXT|" in end_sequence + else end_sequence + ) + self.assertEqual(EXPECTED_TEXT_COMPLETION, clean_sequence) + + @slow + @require_torch_accelerator + def test_model_8b_chat_greedy_generation_chart_vqa(self): + EXPECTED_TEXT_TOKENS = ["The","life expectancy","at","birth","of male","s in","","20","18","is","","80",".","7",".","\n","|ENDOFTEXT|",] # fmt: skip + expected_text_completion = " ".join(EXPECTED_TEXT_TOKENS) # TODO make sure the end string matches + + text_prompt_chart_vqa = "What is the highest life expectancy at birth of male?\n" + + chart_image_url = ( + "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/chart.png" + ) + chart_image_pil = Image.open(io.BytesIO(requests.get(chart_image_url).content)) + + model_inputs_chart_vqa = self.processor(text=text_prompt_chart_vqa, images=chart_image_pil) + generated_tokens = self.model.generate(**model_inputs_chart_vqa, max_new_tokens=10) + text = self.processor.tokenizer.batch_decode(generated_tokens) + end_sequence = text[0].split("\x04")[1] + clean_sequence = ( + end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")] + if "|ENDOFTEXT|" in end_sequence + else end_sequence + ) + self.assertEqual(expected_text_completion, clean_sequence) + + @slow + @require_torch_accelerator + def test_model_8b_chat_greedy_generation_bounding_box(self): + EXPECTED_TEXT_COMPLETION = "\x00194213202244\x01|ENDOFTEXT|" + text_prompt_bbox = "When presented with a box, perform OCR to extract text contained within it. 
If provided with text, generate the corresponding bounding box.\\nWilliams" # noqa: E231 + + bbox_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bbox_sample_image.png" + bbox_image_pil = Image.open(io.BytesIO(requests.get(bbox_image_url).content)) + + model_inputs_bbox = self.processor(text=text_prompt_bbox, images=bbox_image_pil) + generated_tokens = self.model.generate(**model_inputs_bbox, max_new_tokens=10) + text = self.processor.tokenizer.batch_decode(generated_tokens) + end_sequence = text[0].split("\x04")[1] + clean_sequence = ( + end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")] + if "|ENDOFTEXT|" in end_sequence + else end_sequence + ) + self.assertEqual(EXPECTED_TEXT_COMPLETION, clean_sequence) +""" diff --git a/docs/transformers/tests/models/fuyu/test_processor_fuyu.py b/docs/transformers/tests/models/fuyu/test_processor_fuyu.py new file mode 100644 index 0000000000000000000000000000000000000000..1f2c754bd597a71b65633a1e8c052b492ffbba79 --- /dev/null +++ b/docs/transformers/tests/models/fuyu/test_processor_fuyu.py @@ -0,0 +1,409 @@ +import io +import tempfile +import unittest +from shutil import rmtree + +import requests + +from transformers import ( + AutoProcessor, + AutoTokenizer, + FuyuImageProcessor, + FuyuProcessor, + is_torch_available, + is_vision_available, +) +from transformers.testing_utils import require_torch, require_vision + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from PIL import Image + + +if is_torch_available(): + import torch + + from transformers.models.fuyu.processing_fuyu import construct_full_unpacked_stream, full_unpacked_stream_to_tensor + + +@require_torch +@require_vision +class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = FuyuProcessor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + + image_processor = FuyuImageProcessor() + tokenizer = AutoTokenizer.from_pretrained("adept/fuyu-8b") + + processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer) + processor.save_pretrained(cls.tmpdirname) + + cls.text_prompt = "Generate a coco-style caption.\\n" + bus_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png" + cls.bus_image_pil = Image.open(io.BytesIO(requests.get(bus_image_url).content)) + + @classmethod + def tearDownClass(cls): + rmtree(cls.tmpdirname) + + def get_processor(self): + image_processor = FuyuImageProcessor() + tokenizer = AutoTokenizer.from_pretrained("adept/fuyu-8b") + processor = FuyuProcessor(image_processor, tokenizer, **self.prepare_processor_dict()) + + return processor + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + def test_fuyu_processing(self): + """ + Test to ensure that the standard processing on a gold example matches adept's code. 
+ """ + # fmt: off + EXPECTED_IMAGE_PATCH_INPUTS = torch.Tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, -1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,]]).to(torch.int64) + EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 
71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122,]]).to(torch.int64) + + one_image_bus_model_inputs = self.get_processor()(text=self.text_prompt, images=self.bus_image_pil) + + # fmt: on + torch.testing.assert_close(one_image_bus_model_inputs["image_patches_indices"], EXPECTED_IMAGE_PATCH_INPUTS) + torch.testing.assert_close(one_image_bus_model_inputs["input_ids"], EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS) + + def test_fuyu_processing_no_image(self): + """ + Test to check processor works with just text input + """ + processor_outputs = self.get_processor()(text=self.text_prompt) + tokenizer_outputs = self.get_tokenizer()(self.text_prompt) + self.assertEqual(processor_outputs["input_ids"], tokenizer_outputs["input_ids"]) + + def test_fuyu_processing_no_text(self): + """ + Test to check processor works with just image input + """ + # fmt: off + EXPECTED_IMAGE_PATCH_INPUTS = torch.Tensor([ + [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66, + 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93, + 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, + 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133, + 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, + 148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160, + 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, + 175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, + 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200, + 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, + 215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227, + 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, + -1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267, + 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, + 282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294, + 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] + ]).to(torch.int64) + # fmt: on + + processor_outputs = self.get_processor()(images=self.bus_image_pil) + self.assertTrue((processor_outputs["image_patches_indices"] == EXPECTED_IMAGE_PATCH_INPUTS).all()) + + def test_fuyu_processing_multiple_image_sample(self): + """ + Test to check processor works with multiple image inputs for a single text input + """ + # fmt: off + SINGLE_IMAGE_PATCH_INPUTS = torch.Tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 
50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, -1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,]]).to(torch.int64) + SINGLE_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 
71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122,]]).to(torch.int64) + + SINGLE_RESIZED_IMAGE_PATCH_INPUTS = torch.Tensor([[ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1, 12, 13, 14, -1, 15, 16, 17, -1, 18, 19, 20, -1, 21, 22, 23, -1, 24, 25, 26, -1, 27, 28, 29, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]]) + SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[ 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122]]) + # fmt: on + + # Batch of two images - equally sized + images = [self.bus_image_pil, self.bus_image_pil] + processor_outputs = self.get_processor()(text=[self.text_prompt, self.text_prompt], images=images) + + self.assertTrue( + ( + processor_outputs["image_patches_indices"] + == torch.cat([SINGLE_IMAGE_PATCH_INPUTS, SINGLE_IMAGE_PATCH_INPUTS], dim=0) + ).all() + ) + self.assertTrue( + ( + processor_outputs["input_ids"] + == torch.cat([SINGLE_PADDED_UNPACKED_TOKEN_INPUTS, SINGLE_PADDED_UNPACKED_TOKEN_INPUTS], dim=0) + ).all() + ) + + # Processes single images with different sizes as expected + images = [self.bus_image_pil] + processor_outputs = self.get_processor()(text=self.text_prompt, images=images) + self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_IMAGE_PATCH_INPUTS).all()) + self.assertTrue((processor_outputs["input_ids"] == SINGLE_PADDED_UNPACKED_TOKEN_INPUTS).all()) + + images = [self.bus_image_pil.resize((64, 300))] + processor_outputs = self.get_processor()(text=self.text_prompt, images=images) + self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_RESIZED_IMAGE_PATCH_INPUTS).all()) + self.assertTrue((processor_outputs["input_ids"] == SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS).all()) + + # Batch of two images - different sizes. 
Left-pads the smaller image inputs + images = [self.bus_image_pil, self.bus_image_pil.resize((64, 300))] + processor_outputs = self.get_processor()(text=[self.text_prompt, self.text_prompt], images=images) + + padding_len_patch = SINGLE_IMAGE_PATCH_INPUTS.shape[1] - SINGLE_RESIZED_IMAGE_PATCH_INPUTS.shape[1] + padded_single_resized_image_patch = torch.cat( + [torch.ones([1, padding_len_patch]) * -1, SINGLE_RESIZED_IMAGE_PATCH_INPUTS], dim=1 + ) + expected_image_patch_inputs = torch.cat([SINGLE_IMAGE_PATCH_INPUTS, padded_single_resized_image_patch], dim=0) + + padding_len_token = ( + SINGLE_PADDED_UNPACKED_TOKEN_INPUTS.shape[1] - SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS.shape[1] + ) + padded_single_resized_padded_unpacked_token_inputs = torch.cat( + [torch.zeros([1, padding_len_token]), SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS], dim=1 + ) + expected_padded_unpacked_token_inputs = torch.cat( + [SINGLE_PADDED_UNPACKED_TOKEN_INPUTS, padded_single_resized_padded_unpacked_token_inputs], dim=0 + ) + + self.assertTrue((processor_outputs["image_patches_indices"] == expected_image_patch_inputs).all()) + self.assertTrue((processor_outputs["input_ids"] == expected_padded_unpacked_token_inputs).all()) + + # Rewrite as Fuyu supports tokenizer kwargs only when image is None. + @require_vision + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = self.prepare_text_inputs() + # Fuyu uses tokenizer kwargs only when image is None. + image_input = None + + inputs = processor( + text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" + ) + self.assertEqual(len(inputs["input_ids"][0]), 112) + + @unittest.skip("Fuyu processor does not support image_processor kwargs") + def test_image_processor_defaults_preserved_by_image_kwargs(self): + pass + + @unittest.skip("Fuyu processor does not support image_processor kwargs") + def test_kwargs_overrides_default_image_processor_kwargs(self): + pass + + # Rewrite as Fuyu supports tokenizer kwargs only when image is None. + @require_vision + @require_torch + def test_tokenizer_defaults_preserved_by_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = self.prepare_text_inputs() + # Fuyu uses tokenizer kwargs only when image is None. 
+ image_input = None + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(len(inputs["input_ids"][0]), 117) + + # Rewrite as Fuyu image processor does not return pixel values + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + # Fuyu uses tokenizer kwargs only when image is None. + image_input = None + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + # Rewrite as Fuyu image processor does not return pixel values + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = self.prepare_text_inputs() + # Fuyu uses tokenizer kwargs only when image is None. + image_input = None + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + # Rewrite as Fuyu supports tokenizer kwargs only when image is None. + @require_torch + @require_vision + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + # Fuyu uses tokenizer kwargs only when image is None. + image_input = None + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + padding="max_length", + max_length=76, + ) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + # Rewrite as Fuyu supports tokenizer kwargs only when image is None. 
+ @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs(batch_size=2) + # Fuyu uses tokenizer kwargs only when image is None. + image_input = None + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + padding="longest", + max_length=76, + ) + + self.assertEqual(len(inputs["input_ids"][0]), 7) + + +@require_torch +class TestImageTextProcessingUtils(unittest.TestCase): + def setUp(self): + self.batch_size = 2 + self.new_seq_len = 8 + self.num_sub_sequences = 1 + + self.all_bi_tokens_to_place = [4, 6] + self.full_unpacked_stream = [torch.tensor([1, 2, 3, 4]), torch.tensor([5, 6, 7, 8, 9, 10])] + self.fill_value = 0 + + self.num_real_text_tokens = [[3, 2], [2, 4]] + # Here the input stream is padded to avoid inconsistencies (current model release matches) + self.input_stream = torch.tensor([[[1, 2, 3], [4, 5, 0]], [[6, 7, 0], [8, 9, 10]]]) + self.image_tokens = [ + [torch.tensor([1, 2]), torch.tensor([3])], + [torch.tensor([4, 5, 6]), torch.tensor([7, 8])], + ] + + def test_full_unpacked_stream_to_tensor(self): + result = full_unpacked_stream_to_tensor( + self.all_bi_tokens_to_place, + self.full_unpacked_stream, + self.fill_value, + self.batch_size, + self.new_seq_len, + offset=0, + ) + EXPECTED_TENSOR = torch.tensor([[1, 2, 3, 4, 0, 0, 0, 0], [5, 6, 7, 8, 9, 10, 0, 0]]) + self.assertTrue(torch.equal(result, EXPECTED_TENSOR)) + + def test_construct_full_unpacked_stream(self): + result = construct_full_unpacked_stream( + self.num_real_text_tokens, self.input_stream, self.image_tokens, self.batch_size, self.num_sub_sequences + ) + EXPECTED_UNPACKED_STREAM = [torch.tensor([1, 2, 1, 2, 3]), torch.tensor([4, 5, 6, 6, 7])] + for i in range(len(result)): + self.assertTrue(torch.equal(result[i], EXPECTED_UNPACKED_STREAM[i])) + + +@require_torch +class TestProcessImagesForModelInput(unittest.TestCase): + def setUp(self): + """ + Adding a mix of present and absent images. 
+ """ + + self.image_input = torch.randn([1, 1, 3, 64, 64]) + self.image_present = torch.tensor([[1]]) + self.image_unpadded_h = torch.tensor([[45]]) # Adjusted for subsequence of 1 + self.image_unpadded_w = torch.tensor([[50]]) # Adjusted for subsequence of 1 + self.image_patch_dim_h = 16 + self.image_patch_dim_w = 16 + self.image_placeholder_id = 999 + self.image_newline_id = 888 + self.variable_sized = True + self.image_processor = FuyuImageProcessor( + patch_size={"height": self.image_patch_dim_h, "width": self.image_patch_dim_w} + ) + + def test_process_images_for_model_input_fixed_sized(self): + self.variable_sized = False + result = self.image_processor.preprocess_with_tokenizer_info( + image_input=self.image_input, + image_present=self.image_present, + image_unpadded_h=self.image_unpadded_h, + image_unpadded_w=self.image_unpadded_w, + image_placeholder_id=self.image_placeholder_id, + image_newline_id=self.image_newline_id, + variable_sized=self.variable_sized, + ) + self.assertEqual(result["images"][0][0].shape, torch.Size([3, 64, 64])) diff --git a/docs/transformers/tests/models/gemma/__init__.py b/docs/transformers/tests/models/gemma/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/gemma/test_modeling_flax_gemma.py b/docs/transformers/tests/models/gemma/test_modeling_flax_gemma.py new file mode 100644 index 0000000000000000000000000000000000000000..8bd5a5bb416c4e5bc14946b64847cccb70dbbea1 --- /dev/null +++ b/docs/transformers/tests/models/gemma/test_modeling_flax_gemma.py @@ -0,0 +1,264 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest + +import numpy as np + +from transformers import AutoTokenizer, GemmaConfig, is_flax_available +from transformers.testing_utils import require_flax, require_read_token, slow + +from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor + + +if is_flax_available(): + import jax + import jax.numpy as jnp + + from transformers.models.gemma.modeling_flax_gemma import ( + FlaxGemmaForCausalLM, + FlaxGemmaModel, + ) + + +class FlaxGemmaModelTester: + def __init__( + self, + parent, + batch_size=2, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=2, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = None + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = np.tril(np.ones((self.batch_size, self.seq_length))) + + config = GemmaConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + num_key_value_heads=self.num_key_value_heads, + head_dim=self.hidden_size // self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + use_cache=True, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return config, input_ids, input_mask + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + return config, inputs_dict + + def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask): + max_decoder_length = 20 + model = model_class_name(config) + + past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length) + attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4") + + position_ids = jnp.broadcast_to( + jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1) + ) + outputs_cache = model( + input_ids[:, :-1], + attention_mask=attention_mask, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + position_ids = 
jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4") + outputs_cache_next = model( + input_ids[:, -1:], + attention_mask=attention_mask, + past_key_values=outputs_cache.past_key_values, + position_ids=position_ids, + ) + + outputs = model(input_ids) + + diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])) + self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}") + + def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask): + max_decoder_length = 20 + model = model_class_name(config) + + attention_mask_cache = jnp.concatenate( + [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))], + axis=-1, + ) + + past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length) + position_ids = jnp.broadcast_to( + jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1) + ) + + outputs_cache = model( + input_ids[:, :-1], + attention_mask=attention_mask_cache, + past_key_values=past_key_values, + position_ids=position_ids, + ) + position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4") + outputs_cache_next = model( + input_ids[:, -1:], + past_key_values=outputs_cache.past_key_values, + attention_mask=attention_mask_cache, + position_ids=position_ids, + ) + + outputs = model(input_ids, attention_mask=attention_mask) + + diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])) + self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}") + + +@require_flax +class FlaxGemmaModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = (FlaxGemmaModel, FlaxGemmaForCausalLM) if is_flax_available() else () + + def setUp(self): + self.model_tester = FlaxGemmaModelTester(self) + + def test_use_cache_forward(self): + for model_class_name in self.all_model_classes: + config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask) + + def test_use_cache_forward_with_attn_mask(self): + for model_class_name in self.all_model_classes: + config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_use_cache_forward_with_attn_mask( + model_class_name, config, input_ids, attention_mask + ) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("google/gemma-2b", from_pt=True) + outputs = model(np.ones((1, 1))) + self.assertIsNotNone(outputs) + + +@slow +@require_flax +@require_read_token +class FlaxGemmaIntegrationTest(unittest.TestCase): + input_text = ["The capital of France is", "To play the perfect cover drive"] + model_id = "google/gemma-2b" + revision = "flax" + + def setUp(self): + self.model, self.params = FlaxGemmaForCausalLM.from_pretrained( + self.model_id, revision=self.revision, _do_init=False + ) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_id) + self.tokenizer.padding_side = "left" + + def test_logits(self): + inputs = self.tokenizer(self.input_text, return_tensors="np", padding=True) + # fmt: off + EXPECTED_MEAN = [ + [-16.427, -21.386, -35.491, -36.258, -31.401, -36.370, -37.598], + [-21.386, -32.150, -33.155, -34.344, -34.706, -34.678, -38.495], + ] + EXPECTED_SLICE = [-33.462, -16.481, -30.837, -32.195, -33.113] + # fmt: on + + logits = self.model(**inputs, params=self.params).logits + + 
diff_mean = jnp.abs(logits.mean(-1) - np.array(EXPECTED_MEAN)).max() + diff_slice = jnp.abs(logits[0, -1, 475:480] - np.array(EXPECTED_SLICE)).max() + + self.assertAlmostEqual(diff_mean, 0, places=3) + self.assertAlmostEqual(diff_slice, 0, places=3) + + def test_generation(self): + EXPECTED_TEXTS = [ + "The capital of France is a city of contrasts. It is a city of history, of art, of culture, of fashion", + "To play the perfect cover drive, you need to have a good technique and a good mindset.\n\nThe cover drive is a shot", + ] + inputs = self.tokenizer(self.input_text, return_tensors="np", padding=True) + + output = self.model.generate(**inputs, params=self.params, max_new_tokens=20, do_sample=False) + output_text = self.tokenizer.batch_decode(output.sequences, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_jit_generation(self): + EXPECTED_TEXTS = [ + "The capital of France is a city of contrasts. It is a city of history, culture, and art, but it is", + "To play the perfect cover drive, you need to have a good technique and a good mindset.\n\nThe cover drive is a shot", + ] + inputs = self.tokenizer(self.input_text, return_tensors="np", padding=True) + + def generate(input_ids, attention_mask): + outputs = self.model.generate( + input_ids, attention_mask=attention_mask, params=self.params, max_new_tokens=20, do_sample=False + ) + return outputs + + jit_generate = jax.jit(generate) + output_sequences = jit_generate(**inputs).sequences + output_text = self.tokenizer.batch_decode(output_sequences, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) diff --git a/docs/transformers/tests/models/gemma/test_modeling_gemma.py b/docs/transformers/tests/models/gemma/test_modeling_gemma.py new file mode 100644 index 0000000000000000000000000000000000000000..ce0aadd16379fd852b1c0dc44cfb8e2831e6a770 --- /dev/null +++ b/docs/transformers/tests/models/gemma/test_modeling_gemma.py @@ -0,0 +1,761 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Gemma model.""" + +import tempfile +import unittest + +import pytest +from packaging import version + +from transformers import AutoModelForCausalLM, AutoTokenizer, GemmaConfig, is_torch_available +from transformers.generation.configuration_utils import GenerationConfig +from transformers.testing_utils import ( + cleanup, + is_flaky, + require_bitsandbytes, + require_flash_attn, + require_read_token, + require_torch, + require_torch_accelerator, + require_torch_gpu, + require_torch_sdpa, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + GemmaForCausalLM, + GemmaForSequenceClassification, + GemmaForTokenClassification, + GemmaModel, + ) + + +@require_torch +class GemmaModelTester: + config_class = GemmaConfig + if is_torch_available(): + model_class = GemmaModel + for_causal_lm_class = GemmaForCausalLM + for_sequence_class = GemmaForSequenceClassification + for_token_class = GemmaForTokenClassification + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=2, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + pad_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + self.head_dim = self.hidden_size // self.num_attention_heads + + # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTester.prepare_config_and_inputs + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], 
self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return self.config_class( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + num_key_value_heads=self.num_key_value_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + head_dim=self.head_dim, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = self.model_class(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common with Llama->Gemma + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + (GemmaModel, GemmaForCausalLM, GemmaForSequenceClassification, GemmaForTokenClassification) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": GemmaModel, + "text-classification": GemmaForSequenceClassification, + "token-classification": GemmaForTokenClassification, + "text-generation": GemmaForCausalLM, + "zero-shot": GemmaForSequenceClassification, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + + # Need to remove 0.9 in `test_cpu_offload` + # This is because we are hitting edge cases with the causal_mask buffer + model_split_percents = [0.5, 0.6] + + # used in `test_torch_compile_for_training` + _torch_compile_train_cls = GemmaForCausalLM if is_torch_available() else None + + # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + return True + + def setUp(self): + self.model_tester = GemmaModelTester(self) + self.config_tester = ConfigTester(self, config_class=GemmaConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_Gemma_sequence_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = self.model_tester.for_sequence_class(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_Gemma_sequence_classification_model_for_single_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "single_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = self.model_tester.for_sequence_class(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_Gemma_sequence_classification_model_for_multi_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "multi_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor( + [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size + ).to(torch.float) + model = self.model_tester.for_sequence_class(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_Gemma_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) + model = self.model_tester.for_token_class(config=config) + model.to(torch_device) + 
model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), + ) + + @require_flash_attn + @require_torch_gpu + @pytest.mark.flash_attn_test + @slow + def test_flash_attn_2_inference_equivalence_right_padding(self): + self.skipTest(reason="Gemma flash attention does not support right padding") + + @require_torch_sdpa + @require_torch_accelerator + @slow + def test_sdpa_equivalence(self): + for model_class in self.all_model_classes: + if not model_class._supports_sdpa: + self.skipTest(reason="Model does not support SDPA") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.float16, attn_implementation="sdpa" + ) + model_sdpa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, attn_implementation="eager") + model.to(torch_device) + + dummy_input = inputs_dict[model_class.main_input_name] + dummy_input = dummy_input.to(torch_device) + outputs = model(dummy_input, output_hidden_states=True) + outputs_sdpa = model_sdpa(dummy_input, output_hidden_states=True) + + logits = outputs.hidden_states[-1] + logits_sdpa = outputs_sdpa.hidden_states[-1] + + # gemma sdpa needs a high tolerance + assert torch.allclose(logits_sdpa, logits, atol=3e-3) + + @require_flash_attn + @require_torch_gpu + @pytest.mark.flash_attn_test + @is_flaky() + @slow + def test_flash_attn_2_equivalence(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(reason="Model does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, attn_implementation="eager") + model.to(torch_device) + + dummy_input = inputs_dict[model_class.main_input_name] + dummy_input = dummy_input.to(torch_device) + outputs = model(dummy_input, output_hidden_states=True) + outputs_fa = model_fa(dummy_input, output_hidden_states=True) + + logits = outputs.hidden_states[-1] + logits_fa = outputs_fa.hidden_states[-1] + + # gemma flash attention 2 needs a high tolerance + assert torch.allclose(logits_fa, logits, atol=3e-3) + + +@slow +@require_torch_accelerator +class GemmaIntegrationTest(unittest.TestCase): + input_text = ["Hello I am doing", "Hi today"] + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + def tearDown(self): + # See LlamaIntegrationTest.tearDown(). Can be removed once LlamaIntegrationTest.tearDown() is removed. 
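+        # cleanup() empties the accelerator cache between tests to keep memory usage bounded.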
+ cleanup(torch_device, gc_collect=False) + + @require_read_token + def test_model_2b_fp16(self): + model_id = "google/gemma-2b" + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( + torch_device + ) + + model.generation_config.cache_implementation = "static" + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_read_token + def test_model_2b_bf16(self): + model_id = "google/gemma-2b" + + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Khichdi", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( + torch_device + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_read_token + def test_model_2b_eager(self): + model_id = "google/gemma-2b" + + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Khichdi", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager" + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_torch_sdpa + @require_read_token + def test_model_2b_sdpa(self): + model_id = "google/gemma-2b" + + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Khichdi", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="sdpa" + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_flash_attn + @require_read_token + @pytest.mark.flash_attn_test + def test_model_2b_flash_attn(self): + model_id = "google/gemma-2b" + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share 
with you a very easy and simple recipe of Kaju Kat", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_bitsandbytes + @require_read_token + def test_model_2b_4bit(self): + model_id = "google/gemma-2b" + EXPECTED_TEXTS = [ + "Hello I am doing a project and I need to make a 3d model of a house. I have been using", + "Hi today I'd like to share with you my experience with the new wattpad wattpad wattpad wattpad wattpad wattpad wattpad", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @unittest.skip(reason="The test will not fit our CI runners") + @require_read_token + def test_model_7b_fp32(self): + model_id = "google/gemma-7b" + EXPECTED_TEXTS = [ + "Hello my name is ***** ***** I will be assisting you today. I am sorry to hear about your issue. I will", + "Hi,\n\nI have a problem with my 2005 1.6 16", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_read_token + def test_model_7b_fp16(self): + if self.cuda_compute_capability_major_version == 7: + self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU (OOM).") + + model_id = "google/gemma-7b" + EXPECTED_TEXTS = [ + """Hello I am doing a project on a 1999 4.0L 4x4. I""", + "Hi today I am going to show you how to make a simple and easy to make a DIY 3D", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( + torch_device + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_read_token + def test_model_7b_bf16(self): + if self.cuda_compute_capability_major_version == 7: + self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU (OOM).") + + model_id = "google/gemma-7b" + + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in generated text. 
+ EXPECTED_TEXTS = { + 7: [ + """Hello I am doing a project on a 1991 240sx and I am trying to find""", + "Hi today I am going to show you how to make a very simple and easy to make a very simple and", + ], + 8: [ + "Hello I am doing a project for my school and I am trying to make a program that will read a .txt file", + "Hi today I am going to show you how to make a very simple and easy to make a very simple and", + ], + 9: [ + "Hello I am doing a project for my school and I am trying to get a servo to move a certain amount of degrees", + "Hi today I am going to show you how to make a very simple and easy to make DIY light up sign", + ], + } + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( + torch_device + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) + + @require_read_token + def test_model_7b_fp16_static_cache(self): + if self.cuda_compute_capability_major_version == 7: + self.skipTest("This test is failing (`torch.compile` fails) on Nvidia T4 GPU (OOM).") + + model_id = "google/gemma-7b" + EXPECTED_TEXTS = [ + """Hello I am doing a project on a 1999 4.0L 4x4. I""", + "Hi today I am going to show you how to make a simple and easy to make a DIY 3D", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( + torch_device + ) + + model.generation_config.cache_implementation = "static" + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_bitsandbytes + @require_read_token + def test_model_7b_4bit(self): + model_id = "google/gemma-7b" + EXPECTED_TEXTS = [ + "Hello I am doing a project for my school and I am trying to make a program that will take a number and then", + "Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + self.assertEqual(output_text, EXPECTED_TEXTS) + + @slow + @require_torch_accelerator + @require_read_token + def test_compile_static_cache(self): + # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2 + # work as intended. See https://github.com/pytorch/pytorch/issues/121943 + if version.parse(torch.__version__) < version.parse("2.3.0"): + self.skipTest(reason="This test requires torch >= 2.3 to run.") + + NUM_TOKENS_TO_GENERATE = 40 + EXPECTED_TEXT_COMPLETION = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. 
I have looked on the internet and I have found", + "Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the", + ] + + prompts = ["Hello I am doing", "Hi today"] + tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", pad_token="", padding_side="right") + model = GemmaForCausalLM.from_pretrained("google/gemma-2b", device_map=torch_device, torch_dtype=torch.float16) + inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + + # Dynamic Cache + generated_ids = model.generate(**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False) + dynamic_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, dynamic_text) # Both GPU architectures have the same output + + # Static Cache + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" + ) + static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, static_text) + + # Static Cache + compile + model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" + ) + static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, static_compiled_text) + + @slow + @require_read_token + def test_export_static_cache(self): + if version.parse(torch.__version__) < version.parse("2.3.0"): + self.skipTest(reason="This test requires torch >= 2.3 to run.") + + from transformers.integrations.executorch import ( + TorchExportableModuleWithStaticCache, + convert_and_export_with_cache, + ) + + tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", pad_token="", padding_side="right") + EXPECTED_TEXT_COMPLETION = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. 
I have looked on the internet and I have found", + ] + max_generation_length = tokenizer(EXPECTED_TEXT_COMPLETION, return_tensors="pt", padding=True)[ + "input_ids" + ].shape[-1] + + # Load model + device = "cpu" + dtype = torch.bfloat16 + cache_implementation = "static" + attn_implementation = "sdpa" + batch_size = 1 + model = GemmaForCausalLM.from_pretrained( + "google/gemma-2b", + device_map=device, + torch_dtype=dtype, + attn_implementation=attn_implementation, + generation_config=GenerationConfig( + use_cache=True, + cache_implementation=cache_implementation, + max_length=max_generation_length, + cache_config={ + "batch_size": batch_size, + "max_cache_len": max_generation_length, + }, + ), + ) + + prompts = ["Hello I am doing"] + prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + prompt_token_ids = prompt_tokens["input_ids"] + max_new_tokens = max_generation_length - prompt_token_ids.shape[-1] + + # Static Cache + eager + eager_generated_ids = model.generate( + **prompt_tokens, max_new_tokens=max_new_tokens, do_sample=False, cache_implementation=cache_implementation + ) + eager_generated_text = tokenizer.batch_decode(eager_generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, eager_generated_text) + + # Static Cache + export + exported_program = convert_and_export_with_cache(model) + ep_generated_ids = TorchExportableModuleWithStaticCache.generate( + exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens + ) + ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text) + + def test_model_2b_bf16_dola(self): + model_id = "google/gemma-2b" + # ground truth text generated with dola_layers="low", repetition_penalty=1.2 + EXPECTED_TEXTS = [ + "Hello I am doing an experiment and need to get the mass of a block. The problem is, it has no scale", + "Hi today we have the review for a 2016/2017 season of", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( + torch_device + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate( + **inputs, max_new_tokens=20, do_sample=False, dola_layers="low", repetition_penalty=1.2 + ) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + self.assertEqual(output_text, EXPECTED_TEXTS) diff --git a/docs/transformers/tests/models/gemma/test_tokenization_gemma.py b/docs/transformers/tests/models/gemma/test_tokenization_gemma.py new file mode 100644 index 0000000000000000000000000000000000000000..b56a8e95914e13f991c59166dd11a4870924cb8f --- /dev/null +++ b/docs/transformers/tests/models/gemma/test_tokenization_gemma.py @@ -0,0 +1,544 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
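+# Testing suite for the slow and fast Gemma tokenizers.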
+ +import os +import tempfile +import unittest + +from datasets import load_dataset + +from transformers import ( + AddedToken, + GemmaTokenizer, + GemmaTokenizerFast, +) +from transformers.convert_slow_tokenizer import convert_slow_tokenizer +from transformers.testing_utils import ( + get_tests_dir, + nested_simplify, + require_jinja, + require_read_token, + require_sentencepiece, + require_tokenizers, + require_torch, + slow, +) + +from ...test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") + + +@require_sentencepiece +@require_tokenizers +class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/gemma-7b" + tokenizer_class = GemmaTokenizer + rust_tokenizer_class = GemmaTokenizerFast + + test_rust_tokenizer = False + test_sentencepiece = True + from_pretrained_kwargs = {} + + @classmethod + def setUpClass(cls): + super().setUpClass() + # We have a SentencePiece fixture for testing + tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer.pad_token = tokenizer.eos_token + tokenizer.save_pretrained(cls.tmpdirname) + + @require_torch + def test_batch_tokenization(self): + if not self.test_seq2seq: + self.skipTest(reason="test_seq2seq is set to False") + + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Longer text that will definitely require truncation. + text = [ + " UN Chief Says There Is No Military Solution in Syria", + " Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for" + " Syria is that 'there is no military solution' to the nearly five-year conflict and more weapons" + " will only worsen the violence and misery for millions of people.", + ] + try: + batch = tokenizer( + text=text, + max_length=3, + max_target_length=10, + return_tensors="pt", + ) + except NotImplementedError: + self.skipTest(reason="Encountered NotImplementedError when calling tokenizer") + self.assertEqual(batch.input_ids.shape[1], 3) + # max_target_length will default to max_length if not specified + batch = tokenizer(text, max_length=3, return_tensors="pt") + self.assertEqual(batch.input_ids.shape[1], 3) + + batch_encoder_only = tokenizer(text=text, max_length=3, max_target_length=10, return_tensors="pt") + self.assertEqual(batch_encoder_only.input_ids.shape[1], 3) + self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) + self.assertNotIn("decoder_input_ids", batch_encoder_only) + + @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.") + def test_save_slow_from_fast_and_reload_fast(self): + pass + + def test_special_tokens_initialization(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + added_tokens = [AddedToken("", lstrip=True)] + + tokenizer_r = self.get_rust_tokenizer( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + r_output = tokenizer_r.encode("Hey this is a token") + + special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0] + + self.assertTrue(special_token_id in r_output) + + if self.test_slow_tokenizer: + tokenizer_cr = self.get_rust_tokenizer( + pretrained_name, + additional_special_tokens=added_tokens, + **kwargs, # , from_slow=True <- unfortunately too slow to convert + ) + tokenizer_p = self.tokenizer_class.from_pretrained( + pretrained_name, 
additional_special_tokens=added_tokens, **kwargs + ) + + p_output = tokenizer_p.encode("Hey this is a token") + + cr_output = tokenizer_cr.encode("Hey this is a token") + + self.assertEqual(p_output, r_output) + self.assertEqual(cr_output, r_output) + self.assertTrue(special_token_id in p_output) + self.assertTrue(special_token_id in cr_output) + + @slow + @require_read_token + def test_tokenizer_integration(self): + expected_encoding = {'input_ids': [[2, 158434, 591, 84193, 3836, 685, 6599, 31223, 235290, 140247, 578, 6599, 31223, 235290, 145139, 235290, 3491, 235275, 6572, 3311, 235290, 38197, 109959, 591, 25894, 235269, 162174, 235290, 235284, 235269, 1791, 6362, 12481, 235269, 1576, 18622, 235269, 2900, 1136, 86684, 235269, 29092, 4632, 16994, 604, 13146, 14944, 40371, 591, 19700, 235327, 235275, 578, 13146, 14944, 25511, 591, 235300, 12474, 235275, 675, 1163, 235248, 235304, 235284, 235340, 229903, 5377, 575, 235248, 235274, 235276, 235276, 235340, 17044, 578, 5271, 1061, 118345, 1865, 125247, 235269, 8745, 111226, 578, 176888, 235265], [2, 25894, 603, 6869, 577, 953, 235290, 8297, 5271, 209099, 41642, 774, 748, 78253, 2793, 731, 51506, 34346, 611, 2145, 2731, 578, 1833, 4807, 575, 832, 16630, 235265], [2, 651, 4320, 8426, 25341, 36271, 1163, 573, 27894, 5929, 235265]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="google/gemma-2b", + padding=False, + ) + + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") + def test_pickle_subword_regularization_tokenizer(self): + pass + + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") + def test_subword_regularization_tokenizer(self): + pass + + @unittest.skip(reason="Skipping") + def test_torch_encode_plus_sent_to_model(self): + pass + + +@require_torch +@require_sentencepiece +@require_tokenizers +class GemmaIntegrationTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + checkpoint_name = "hf-internal-testing/dummy-gemma" + cls.tokenizer: GemmaTokenizer = GemmaTokenizer.from_pretrained( + checkpoint_name, eos_token="" + ) # add this token + cls.rust_tokenizer = GemmaTokenizerFast.from_pretrained( + checkpoint_name, eos_token="", from_slow=True + ) # add this token + return cls + + @require_torch + def integration_tests(self): + inputs = self.tokenizer( + ["The following string should be properly encoded: Hello.", "But ird and ปี ird ด"], + return_tensors="pt", + ) + + self.assertEqual( + nested_simplify(inputs), + { + "input_ids": [ + [2, 450, 1494, 1347, 881, 367, 6284, 18511, 29901, 15043, 29889], + [2, 1205, 29871, 1823, 322, 29871, 31010, 30691, 1678, 1823, 1678, 30718], + ], + "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], + }, + ) + + def test_user_added_tokens(self): + # Ensure that user added tokens are not split in the fast tokenizer + slow_tokenizer = self.tokenizer + fast_tokenizer = self.rust_tokenizer + + user_added_token = "" + + slow_tokens = slow_tokenizer.convert_ids_to_tokens(slow_tokenizer.encode(user_added_token)) + fast_tokens = 
slow_tokenizer.convert_ids_to_tokens(fast_tokenizer.encode(user_added_token)) + + self.assertTrue(user_added_token in fast_tokens) + self.assertEqual(slow_tokens, fast_tokens) + + def test_fast_special_tokens(self): + slow_tokenizer = self.tokenizer + fast_tokenizer = self.rust_tokenizer + slow = slow_tokenizer.encode("A sample test", add_special_tokens=True) + assert slow == [2, 235280, 6453, 2121] + + fast_tokenizer.add_eos_token = False + fast = fast_tokenizer.encode("A sample test", add_special_tokens=True) + assert fast == [2, 235280, 6453, 2121] + + fast_tokenizer.add_eos_token = True + fast = fast_tokenizer.encode("A sample test", add_special_tokens=True) + assert fast == [2, 235280, 6453, 2121, 204] + + slow_tokenizer.add_eos_token = True + slow = slow_tokenizer.encode("A sample test", add_special_tokens=True) + assert slow == [2, 235280, 6453, 2121, 204] + + self.tokenizer.add_eos_token = False + self.rust_tokenizer.add_eos_token = False + + def test_fast_merge_priority(self): + slow_tokenizer = self.tokenizer + fast_tokenizer = self.rust_tokenizer + text = " " + target = [168, 153] + slow = slow_tokenizer.encode(text, add_special_tokens=False) + assert slow == target + + fast = fast_tokenizer.encode(text, add_special_tokens=False) + assert fast == target + + @unittest.skip(reason="Not super important and always failing. Let's skip it") + @slow + def test_conversion(self): + # This is excruciatingly slow since it has to recreate the entire merge + # list from the original vocabulary in spm + self.rust_tokenizer.save_pretrained("./out") + with tempfile.TemporaryDirectory() as dirname: + self.rust_tokenizer.save_pretrained(dirname) + + with open(os.path.join(dirname, "tokenizer.json")) as f: + old_serialized = f.read() + + new_tokenizer = convert_slow_tokenizer(self.tokenizer) + with tempfile.NamedTemporaryFile() as f: + new_tokenizer.save(f.name) + # Re-opening since `f` is in bytes. 
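+            # The tokenizer converted from slow should serialize identically to the saved fast one.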
+ new_serialized = open(f.name).read() + with open("out_tokenizer.json", "w") as g: + g.write(new_serialized) + + self.assertEqual(old_serialized, new_serialized) + + def test_simple_encode_decode(self): + pyth_tokenizer = self.tokenizer + rust_tokenizer = self.rust_tokenizer + + self.tokenizer.add_eos_token = False + self.rust_tokenizer.add_eos_token = False + + self.assertEqual(pyth_tokenizer.encode("This is a test"), [2, 1596, 603, 476, 2121]) + self.assertEqual(rust_tokenizer.encode("This is a test"), [2, 1596, 603, 476, 2121]) + self.assertEqual(pyth_tokenizer.decode([2, 1596, 603, 476, 2121], skip_special_tokens=True), "This is a test") + self.assertEqual(rust_tokenizer.decode([2, 1596, 603, 476, 2121], skip_special_tokens=True), "This is a test") + + # bytefallback showcase + self.assertEqual(pyth_tokenizer.encode("生活的真谛是"), [2, 122182, 235710, 245467, 235427] ) # fmt: skip + self.assertEqual(rust_tokenizer.encode("生活的真谛是"), [2, 122182, 235710, 245467, 235427] ) # fmt: skip + self.assertEqual( + pyth_tokenizer.decode([2, 122182, 235710, 245467, 235427], skip_special_tokens=True), + "生活的真谛是", + ) + self.assertEqual( + rust_tokenizer.decode([2, 122182, 235710, 245467, 235427], skip_special_tokens=True), + "生活的真谛是", + ) + + # Inner spaces showcase + self.assertEqual(pyth_tokenizer.encode("Hi Hello"), [2, 2151, 139, 4521]) + self.assertEqual(rust_tokenizer.encode("Hi Hello"), [2, 2151, 139, 4521]) + self.assertEqual(pyth_tokenizer.decode([2, 2151, 139, 4521], skip_special_tokens=True), "Hi Hello") + self.assertEqual(rust_tokenizer.decode([2, 2151, 139, 4521], skip_special_tokens=True), "Hi Hello") + + self.assertEqual(pyth_tokenizer.encode("Hi Hello"), [2, 2151, 140, 4521]) + self.assertEqual(rust_tokenizer.encode("Hi Hello"), [2, 2151, 140, 4521]) + self.assertEqual(pyth_tokenizer.decode([2, 2151, 140, 4521], skip_special_tokens=True), "Hi Hello") + self.assertEqual(rust_tokenizer.decode([2, 2151, 140, 4521], skip_special_tokens=True), "Hi Hello") + + self.assertEqual(pyth_tokenizer.encode(""), [2]) + self.assertEqual(rust_tokenizer.encode(""), [2]) + + self.assertEqual(pyth_tokenizer.encode(" "), [2, 235248]) + self.assertEqual(rust_tokenizer.encode(" "), [2, 235248]) + + self.assertEqual(pyth_tokenizer.encode(" "), [2, 139]) + self.assertEqual(rust_tokenizer.encode(" "), [2, 139]) + + self.assertEqual(pyth_tokenizer.encode(" Hello"), [2, 25957]) + self.assertEqual(rust_tokenizer.encode(" Hello"), [2, 25957]) + + def test_no_differences_decode(self): + self.tokenizer.add_eos_token = False + self.rust_tokenizer.add_eos_token = False + pyth_tokenizer = self.tokenizer + rust_tokenizer = self.rust_tokenizer + + self.assertEqual(pyth_tokenizer.decode([869]), "og") + self.assertEqual(rust_tokenizer.decode([869]), "og") + + self.assertEqual(pyth_tokenizer.decode([30112, 869]), " expenditureog") + self.assertEqual(rust_tokenizer.decode([30112, 869]), " expenditureog") + + def test_no_differences_special_tokens(self): + pyth_tokenizer = self.tokenizer + rust_tokenizer = self.rust_tokenizer + self.assertEqual(pyth_tokenizer.encode(""), [2]) + self.assertEqual(rust_tokenizer.encode(""), [2]) + + self.assertEqual(pyth_tokenizer.encode(""), [2, 204]) + self.assertEqual(rust_tokenizer.encode(""), [2, 204]) + + @unittest.skipIf( + os.getenv("RUN_TOKENIZER_INTEGRATION", "0") == "0", + "RUN_TOKENIZER_INTEGRATION=1 to run tokenizer integration tests", + ) + def test_integration_test_xnli(self): + import tqdm + + pyth_tokenizer = self.tokenizer + rust_tokenizer = self.rust_tokenizer + + dataset = 
load_dataset("google/code_x_glue_ct_code_to_text", "go") + for item in tqdm.tqdm(dataset["validation"]): + string = item["code"] + encoded1 = pyth_tokenizer.encode(string) + encoded2 = rust_tokenizer.encode(string) + + self.assertEqual( + encoded1, + encoded2, + msg="Hint: the following tokenization diff were obtained for slow vs fast:\n " + f"elements in slow: {set(pyth_tokenizer.tokenize(string)) - set(rust_tokenizer.tokenize(string))} \nvs\n " + f"elements in fast: {set(rust_tokenizer.tokenize(string)) - set(pyth_tokenizer.tokenize(string))} \n\n{string}", + ) + + decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True) + decoded2 = rust_tokenizer.decode(encoded1, skip_special_tokens=True) + + self.assertEqual(decoded1, decoded2) + + dataset = load_dataset("facebook/xnli", "all_languages") + + for item in tqdm.tqdm(dataset["train"]): + for string in item["premise"].values(): + encoded1 = pyth_tokenizer.encode(string) + encoded2 = rust_tokenizer.encode(string) + + self.assertEqual(encoded1, encoded2, msg=f"failed on {string}") + + decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True) + decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True) + + self.assertEqual(decoded1, decoded2) + + def test_special_token_special_word(self): + # the word inform should be split as ['in', 'form'] + tokenizer = GemmaTokenizer.from_pretrained("hf-internal-testing/dummy-gemma") + tokenizer.add_tokens([AddedToken("", rstrip=True, lstrip=True)], special_tokens=False) + out1 = tokenizer.decode( + tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=False + ) + self.assertEqual(out1, "inform") + out2 = tokenizer.decode( + tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=True + ) + # decoding strips the added prefix space. + self.assertEqual(out2, " inform") + input_ids = tokenizer.encode("inform", add_special_tokens=False) + self.assertEqual(input_ids, [256000, 43910]) + + out2 = tokenizer.decode( + tokenizer.encode(" inform", add_special_tokens=False), spaces_between_special_tokens=False + ) + # TODO @ArthurZ currently we strip left and right, so this will not keep the spaces + self.assertEqual(out2, "inform") + + ### Let's make sure decoding does not add extra spaces here and there + # TODO @ArthurZ this should be affected by the lstrip/rstrip/single word /normalize refactoring + # Since currently we always strip left and right of the token, results are as such + input_ids = tokenizer.encode(" Hellohow", add_special_tokens=False) + self.assertEqual(input_ids, [204, 25957, 204, 1139]) + tokens = tokenizer.tokenize(" Hellohow", add_special_tokens=False) + self.assertEqual(tokens, ["", "▁Hello", "", "how"]) + decoded_tokens = tokenizer.decode(input_ids) + self.assertEqual(decoded_tokens, " Hellohow") + + # Let's make sure that if there are any spaces, we don't remove them! 
+ input_ids = tokenizer.encode(" Hello how", add_special_tokens=False) + self.assertEqual(input_ids, [235248, 204, 25957, 204, 1368]) + tokens = tokenizer.tokenize(" Hello how", add_special_tokens=False) + self.assertEqual(tokens, ["▁", "", "▁Hello", "", "▁how"]) + decoded_tokens = tokenizer.decode(input_ids) + self.assertEqual(decoded_tokens, " Hello how") + + def test_some_edge_cases(self): + tokenizer = GemmaTokenizer.from_pretrained("hf-internal-testing/dummy-gemma") + + sp_tokens = tokenizer.sp_model.encode(">", out_type=str) + self.assertEqual(sp_tokens, ["", ">"]) + tokens = tokenizer.tokenize(">") + self.assertEqual(sp_tokens, tokens) + self.assertEqual(tokens, ["", ">"]) + + tokens = tokenizer.tokenize("") + self.assertEqual(tokens, []) + self.assertEqual(tokens, tokenizer.sp_model.encode("", out_type=str)) + + tokens = tokenizer.tokenize(" ") + self.assertEqual(tokens, ["▁"]) + # a dummy prefix space is not added by the sp_model as it was de-activated + self.assertEqual(tokens, tokenizer.sp_model.encode(" ", out_type=str)) + + tokens = tokenizer.tokenize("▁") + self.assertEqual(tokens, ["▁"]) + # a dummy prefix space is not added by the sp_model as it was de-activated + self.assertEqual(tokens, tokenizer.sp_model.encode("▁", out_type=str)) + + tokens = tokenizer.tokenize(" ▁") + self.assertEqual(tokens, ["▁▁"]) + # a dummy prefix space is not added by the sp_model as it was de-activated + self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁", out_type=str)) + + @require_jinja + def test_tokenization_for_chat(self): + tokenizer = GemmaTokenizer.from_pretrained("hf-internal-testing/dummy-gemma") + + test_chats = [ + [{"role": "user", "content": "Hello!"}], + [ + {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Nice to meet you."}, + ], + [{"role": "user", "content": "Hello!"}], + ] + # Matt: The third test case tests the default system message, but if this is ever changed in the + # class/repo code then that test will fail, and the case will need to be updated. 
+ tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] + expected_tokens = [[235322, 235371, 571, 235298, 2997, 73786, 1645, 108, 4521, 149907, 235371, 571, 235298, 615, 73786, 108], [235322, 235371, 571, 235298, 2997, 73786, 1645, 108, 4521, 149907, 235371, 571, 235298, 615, 73786, 108, 235322, 235371, 571, 235298, 2997, 73786, 105776, 108, 7731, 577, 4664, 692, 35606, 235371, 571, 235298, 615, 73786, 108], [235322, 235371, 571, 235298, 2997, 73786, 1645, 108, 4521, 149907, 235371, 571, 235298, 615, 73786, 108]] # fmt: skip + for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): + self.assertListEqual(tokenized_chat, expected_tokens) + + def test_save_fast_load_slow(self): + # Ensure that we can save a fast tokenizer and load it as a slow tokenizer + slow_tokenizer = self.tokenizer + text = "a " + target_encoded = [2, 235250, 139] + slow = slow_tokenizer.encode(text, add_special_tokens=True) + assert slow == target_encoded + + slow_decoded = slow_tokenizer.decode(slow, skip_special_tokens=True) + assert slow_decoded == text + + with tempfile.TemporaryDirectory() as dirname: + # Save fast tokenizer + self.rust_tokenizer.save_pretrained(dirname) + + # Load slow tokenizer with fast files present in the directory + slow_tokenizer_from_fast = GemmaTokenizer.from_pretrained(dirname) + + slow_from_fast = slow_tokenizer_from_fast.encode(text, add_special_tokens=True) + assert slow_from_fast == target_encoded + + slow_from_fast_decoded = slow_tokenizer_from_fast.decode(slow, skip_special_tokens=True) + assert slow_from_fast_decoded == text + + +@require_sentencepiece +@require_tokenizers +class CommonSpmIntegrationTests(unittest.TestCase): + """ + A class that regroups important test to make sure that we properly handle the special tokens. + """ + + def test_edge_case_tabulation(self): + fast_tokenizer = GemmaTokenizerFast.from_pretrained("hf-internal-testing/dummy-gemma") + slow_tokenizer = GemmaTokenizer.from_pretrained("hf-internal-testing/dummy-gemma") + input_text = "Hey. \t\t \n\nyou é @#😈 🤗! , 1234 15 5,61" + EXPECTED_IDS = [ 2, 6750, 1, 235265, 235248, 255969, 235248, 109, 4747, 139, 235335, 139, 216311, 241316, 139, 239880, 235341, 144, 235269, 235248, 235274, 235284, 235304, 235310, 235248, 235274, 235308, 235248, 235308, 235269, 235318, 235274] # fmt: skip + EXPECTED_TOKENS = [ "Hey", "", ".", "▁", "\t\t", "▁", "\n\n", "you", "▁▁", "é", "▁▁", "@#", "😈", "▁▁", "🤗", "!", "▁▁▁▁▁▁▁", ",", "▁", "1", "2", "3", "4", "▁", "1", "5", "▁", "5", ",", "6", "1"] # fmt: skip + + tokens = fast_tokenizer.tokenize(input_text) + with self.subTest("test fast edge case fast"): + self.assertEqual(tokens, EXPECTED_TOKENS) + + tokens = slow_tokenizer.tokenize(input_text) + with self.subTest("test fast edge case fast"): + self.assertEqual(tokens, EXPECTED_TOKENS) + + input_ids = fast_tokenizer.encode(input_text) + with self.subTest("test fast edge case fast"): + self.assertEqual(input_ids, EXPECTED_IDS) + + input_ids = slow_tokenizer.encode(input_text) + with self.subTest("test fast edge case fast"): + self.assertEqual(input_ids, EXPECTED_IDS) + + text = fast_tokenizer.decode(EXPECTED_IDS) + with self.subTest("test fast edge case fast"): + self.assertEqual(text, "Hey. \t\t \n\nyou é @#😈 🤗! , 1234 15 5,61") + + text = slow_tokenizer.decode(EXPECTED_IDS) + with self.subTest("test fast edge case fast"): + self.assertEqual(text, "Hey. \t\t \n\nyou é @#😈 🤗! 
, 1234 15 5,61") + + input_text = "\t\t\t\t \n\n61" + EXPECTED_IDS = [2, 255971, 235248, 109, 235318, 235274] + EXPECTED_TOKENS = ["\t\t\t\t", "▁", "\n\n", "6", "1"] + + tokens = fast_tokenizer.tokenize(input_text) + with self.subTest("test fast edge case fast"): + self.assertEqual(tokens, EXPECTED_TOKENS) + + tokens = slow_tokenizer.tokenize(input_text) + with self.subTest("test fast edge case fast"): + self.assertEqual(tokens, EXPECTED_TOKENS) + + input_ids = fast_tokenizer.encode(input_text) + with self.subTest("test fast edge case fast"): + self.assertEqual(input_ids, EXPECTED_IDS) + + input_ids = slow_tokenizer.encode(input_text) + with self.subTest("test fast edge case fast"): + self.assertEqual(input_ids, EXPECTED_IDS) + + text = fast_tokenizer.decode(EXPECTED_IDS) + with self.subTest("test fast edge case fast"): + self.assertEqual(text, "\t\t\t\t \n\n61") + + text = slow_tokenizer.decode(EXPECTED_IDS) + with self.subTest("test fast edge case fast"): + self.assertEqual(text, "\t\t\t\t \n\n61") diff --git a/docs/transformers/tests/models/gemma2/__init__.py b/docs/transformers/tests/models/gemma2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/gemma2/test_modeling_gemma2.py b/docs/transformers/tests/models/gemma2/test_modeling_gemma2.py new file mode 100644 index 0000000000000000000000000000000000000000..d1ba0cbec4e6726f857fde8492388c03d35b42f8 --- /dev/null +++ b/docs/transformers/tests/models/gemma2/test_modeling_gemma2.py @@ -0,0 +1,396 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Gemma2 model.""" + +import unittest + +import pytest +from packaging import version +from parameterized import parameterized +from pytest import mark + +from transformers import AutoModelForCausalLM, AutoTokenizer, Gemma2Config, is_torch_available, pipeline +from transformers.generation.configuration_utils import GenerationConfig +from transformers.testing_utils import ( + require_flash_attn, + require_read_token, + require_torch, + require_torch_accelerator, + require_torch_gpu, + slow, + tooslow, + torch_device, +) + +from ...models.gemma.test_modeling_gemma import GemmaModelTest, GemmaModelTester +from ...test_configuration_common import ConfigTester + + +if is_torch_available(): + import torch + + from transformers import ( + Gemma2ForCausalLM, + Gemma2ForSequenceClassification, + Gemma2ForTokenClassification, + Gemma2Model, + ) + + +class Gemma2ModelTester(GemmaModelTester): + if is_torch_available(): + config_class = Gemma2Config + model_class = Gemma2Model + for_causal_lm_class = Gemma2ForCausalLM + for_sequence_class = Gemma2ForSequenceClassification + for_token_class = Gemma2ForTokenClassification + + +@require_torch +class Gemma2ModelTest(GemmaModelTest, unittest.TestCase): + all_model_classes = ( + (Gemma2Model, Gemma2ForCausalLM, Gemma2ForSequenceClassification, Gemma2ForTokenClassification) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": Gemma2Model, + "text-classification": Gemma2ForSequenceClassification, + "token-classification": Gemma2ForTokenClassification, + "text-generation": Gemma2ForCausalLM, + "zero-shot": Gemma2ForSequenceClassification, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + _is_stateful = True + model_split_percents = [0.5, 0.6] + + def setUp(self): + self.model_tester = Gemma2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=Gemma2Config, hidden_size=37) + + @unittest.skip("Failing because of unique cache (HybridCache)") + def test_model_outputs_equivalence(self, **kwargs): + pass + + @unittest.skip("Gemma2's forcefully disables sdpa due to softcapping") + def test_sdpa_can_dispatch_non_composite_models(self): + pass + + @unittest.skip("Gemma2's eager attn/sdpa attn outputs are expected to be different") + def test_eager_matches_sdpa_generate(self): + pass + + @parameterized.expand([("random",), ("same",)]) + @pytest.mark.generate + @unittest.skip("Gemma2 has HybridCache which is not compatible with assisted decoding") + def test_assisted_decoding_matches_greedy_search(self, assistant_type): + pass + + @unittest.skip("Gemma2 has HybridCache which is not compatible with assisted decoding") + def test_prompt_lookup_decoding_matches_greedy_search(self, assistant_type): + pass + + @pytest.mark.generate + @unittest.skip("Gemma2 has HybridCache which is not compatible with assisted decoding") + def test_assisted_decoding_sample(self): + pass + + @unittest.skip("Gemma2 has HybridCache which is not compatible with dola decoding") + def test_dola_decoding_sample(self): + pass + + @unittest.skip("Gemma2 has HybridCache and doesn't support continue from past kv") + def test_generate_continue_from_past_key_values(self): + pass + + @unittest.skip("Gemma2 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate(self): + pass + + @unittest.skip("Gemma2 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + 
+ @unittest.skip("Gemma2 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate_low_memory(self): + pass + + @unittest.skip("Gemma2 has HybridCache and doesn't support StaticCache. Though it could, it shouldn't support.") + def test_generate_with_static_cache(self): + pass + + @unittest.skip("Gemma2 has HybridCache and doesn't support StaticCache. Though it could, it shouldn't support.") + def test_generate_from_inputs_embeds_with_static_cache(self): + pass + + @unittest.skip("Gemma2 has HybridCache and doesn't support StaticCache. Though it could, it shouldn't support.") + def test_generate_continue_from_inputs_embeds(self): + pass + + @unittest.skip("Gemma2's eager attn/sdpa attn outputs are expected to be different") + def test_sdpa_equivalence(self): + pass + + @unittest.skip( + reason="HybridCache can't be gathered because it is not iterable. Adding a simple iter and dumping `distributed_iterator`" + " as in Dynamic Cache doesnt work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting" + ) + def test_multi_gpu_data_parallel_forward(self): + pass + + @unittest.skip("Gemma2 has HybridCache which auto-compiles. Compile and FA2 don't work together.") + def test_eager_matches_fa2_generate(self): + pass + + +@slow +@require_torch_accelerator +class Gemma2IntegrationTest(unittest.TestCase): + input_text = ["Hello I am doing", "Hi today"] + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + @tooslow + @require_read_token + def test_model_9b_bf16(self): + model_id = "google/gemma-2-9b" + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many", + "Hi today I'm going to be talking about the history of the United States. The United States of America", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager" + ).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=False) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @tooslow + @require_read_token + def test_model_9b_fp16(self): + model_id = "google/gemma-2-9b" + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many", + "Hi today I'm going to be talking about the history of the United States. 
The United States of America", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, attn_implementation="eager" + ).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=False) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_read_token + @tooslow + def test_model_9b_pipeline_bf16(self): + # See https://github.com/huggingface/transformers/pull/31747 -- pipeline was broken for Gemma2 before this PR + model_id = "google/gemma-2-9b" + # EXPECTED_TEXTS should match the same non-pipeline test, minus the special tokens + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many", + "Hi today I'm going to be talking about the history of the United States. The United States of America", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention" + ).to(torch_device) + tokenizer = AutoTokenizer.from_pretrained(model_id) + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + + output = pipe(self.input_text, max_new_tokens=20, do_sample=False, padding=True) + + self.assertEqual(output[0][0]["generated_text"], EXPECTED_TEXTS[0]) + self.assertEqual(output[1][0]["generated_text"], EXPECTED_TEXTS[1]) + + @require_read_token + def test_model_2b_pipeline_bf16_flex_attention(self): + # See https://github.com/huggingface/transformers/pull/31747 -- pipeline was broken for Gemma2 before this PR + model_id = "google/gemma-2-2b" + # EXPECTED_TEXTS should match the same non-pipeline test, minus the special tokens + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1960s and I am trying to find out what the average", + "Hi today I'm going to be talking about the 10 best anime of all time.\n\n1", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention" + ).to(torch_device) + tokenizer = AutoTokenizer.from_pretrained(model_id) + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + + output = pipe(self.input_text, max_new_tokens=20, do_sample=False, padding=True) + + self.assertEqual(output[0][0]["generated_text"], EXPECTED_TEXTS[0]) + self.assertEqual(output[1][0]["generated_text"], EXPECTED_TEXTS[1]) + + @require_read_token + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + @tooslow + def test_model_9b_flash_attn(self): + # See https://github.com/huggingface/transformers/issues/31953 --- flash attn was generating garbage for gemma2, especially in long context + model_id = "google/gemma-2-9b" + EXPECTED_TEXTS = [ + 'Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many people died in the United States. I have found a few sites that say 500,000 but I am not sure if that is correct. I have also found a site that says 675,000 but I am not sure if that is correct either. I am trying to find out how many people died in the United States. I have found a few', + "Hi today I'm going to be talking about the history of the United States. The United States of America is a country in North America. 
It is the third largest country in the world by total area and the third most populous country with over 320 million people. The United States is a federal republic consisting of 50 states and a federal district. The 48 contiguous states and the district of Columbia are in central North America between Canada and Mexico. The state of Alaska is in the" + ] # fmt: skip + + model = AutoModelForCausalLM.from_pretrained( + model_id, attn_implementation="flash_attention_2", torch_dtype="float16" + ).to(torch_device) + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=100, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=False) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @slow + @require_read_token + def test_export_static_cache(self): + if version.parse(torch.__version__) < version.parse("2.5.0"): + self.skipTest(reason="This test requires torch >= 2.5 to run.") + + from transformers.integrations.executorch import ( + TorchExportableModuleWithStaticCache, + convert_and_export_with_cache, + ) + + tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b", pad_token="", padding_side="right") + EXPECTED_TEXT_COMPLETION = [ + "Hello I am doing a project for my school and I need to know how to make a program that will take a number", + ] + max_generation_length = tokenizer(EXPECTED_TEXT_COMPLETION, return_tensors="pt", padding=True)[ + "input_ids" + ].shape[-1] + + # Load model + device = "cpu" + dtype = torch.bfloat16 + cache_implementation = "static" + attn_implementation = "sdpa" + batch_size = 1 + model = AutoModelForCausalLM.from_pretrained( + "google/gemma-2-2b", + device_map=device, + torch_dtype=dtype, + attn_implementation=attn_implementation, + generation_config=GenerationConfig( + use_cache=True, + cache_implementation=cache_implementation, + max_length=max_generation_length, + cache_config={ + "batch_size": batch_size, + "max_cache_len": max_generation_length, + }, + ), + ) + + prompts = ["Hello I am doing"] + prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + prompt_token_ids = prompt_tokens["input_ids"] + max_new_tokens = max_generation_length - prompt_token_ids.shape[-1] + + # Static Cache + export + exported_program = convert_and_export_with_cache(model) + ep_generated_ids = TorchExportableModuleWithStaticCache.generate( + exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens + ) + ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text) + + @require_read_token + @tooslow + def test_model_9b_bf16_flex_attention(self): + model_id = "google/gemma-2-9b" + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many", + "Hi today I'm going to be talking about the history of the United States. 
The United States of America", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flex_attention" + ).to(torch_device) + assert model.config._attn_implementation == "flex_attention" + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=False) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @parameterized.expand([("flash_attention_2",), ("sdpa",), ("flex_attention",), ("eager",)]) + @require_read_token + def test_generation_beyond_sliding_window(self, attn_implementation: str): + """Test that we can correctly generate beyond the sliding window. This is non trivial as + we need to correctly slice the attention mask in all cases (because we use a HybridCache). + Outputs for every attention functions should be coherent and identical. + """ + + if torch_device == "xpu" and attn_implementation == "flash_attention_2": + self.skipTest(reason="Intel XPU doesn't support falsh_attention_2 as of now.") + + model_id = "google/gemma-2-2b" + EXPECTED_COMPLETIONS = [ + " the people, the food, the culture, the history, the music, the art, the architecture", + ", green, yellow, orange, purple, pink, brown, black, white, gray, silver", + ] + + input_text = [ + "This is a nice place. " * 800 + "I really enjoy the scenery,", # This is larger than 4096 tokens + "A list of colors: red, blue", # This will almost all be padding tokens + ] + tokenizer = AutoTokenizer.from_pretrained(model_id, padding="left") + inputs = tokenizer(input_text, padding=True, return_tensors="pt").to(torch_device) + + model = AutoModelForCausalLM.from_pretrained( + model_id, attn_implementation=attn_implementation, torch_dtype=torch.float16 + ).to(torch_device) + + # Make sure prefill is larger than sliding window + input_size = inputs.input_ids.shape[-1] + self.assertTrue(input_size > model.config.sliding_window) + + out = model.generate(**inputs, max_new_tokens=20)[:, input_size:] + output_text = tokenizer.batch_decode(out) + + self.assertEqual(output_text, EXPECTED_COMPLETIONS) diff --git a/docs/transformers/tests/models/gemma3/__init__.py b/docs/transformers/tests/models/gemma3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/gemma3/test_image_processing_gemma3.py b/docs/transformers/tests/models/gemma3/test_image_processing_gemma3.py new file mode 100644 index 0000000000000000000000000000000000000000..a32e475da0b37ca42ec7d995df0b8240ab7bfb85 --- /dev/null +++ b/docs/transformers/tests/models/gemma3/test_image_processing_gemma3.py @@ -0,0 +1,292 @@ +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import Gemma3ImageProcessor + + if is_torchvision_available(): + from transformers import Gemma3ImageProcessorFast + + +class Gemma3ImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=IMAGENET_STANDARD_MEAN, + image_std=IMAGENET_STANDARD_STD, + do_convert_rgb=True, + do_pan_and_scan=True, + pan_and_scan_min_crop_size=10, + pan_and_scan_max_num_crops=2, + pan_and_scan_min_ratio_to_activate=1.2, + ): + super().__init__() + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + self.do_pan_and_scan = do_pan_and_scan + self.pan_and_scan_min_crop_size = pan_and_scan_min_crop_size + self.pan_and_scan_max_num_crops = pan_and_scan_max_num_crops + self.pan_and_scan_min_ratio_to_activate = pan_and_scan_min_ratio_to_activate + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + "do_pan_and_scan": self.do_pan_and_scan, + "pan_and_scan_min_crop_size": self.pan_and_scan_min_crop_size, + "pan_and_scan_max_num_crops": self.pan_and_scan_max_num_crops, + "pan_and_scan_min_ratio_to_activate": self.pan_and_scan_min_ratio_to_activate, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.size["height"], self.size["width"] + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class Gemma3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = Gemma3ImageProcessor if is_vision_available() else None + fast_image_processing_class = Gemma3ImageProcessorFast if is_torchvision_available() else None + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->Gemma3 + def setUp(self): + super().setUp() + self.image_processor_tester = Gemma3ImageProcessingTester(self) + + @property + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict + def image_processor_dict(self): + return 
self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + self.assertTrue(hasattr(image_processing, "do_pan_and_scan")) + self.assertTrue(hasattr(image_processing, "pan_and_scan_min_crop_size")) + self.assertTrue(hasattr(image_processing, "pan_and_scan_max_num_crops")) + self.assertTrue(hasattr(image_processing, "pan_and_scan_min_ratio_to_activate")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=84) + self.assertEqual(image_processor.size, {"height": 84, "width": 84}) + + def test_without_pan_and_scan(self): + """ + Disable do_pan_and_scan parameter. + """ + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processor = image_processing_class.from_dict(self.image_processor_dict, do_pan_and_scan=False) + + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processor(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = (1, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched + encoded_images = image_processor(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + def test_pan_and_scan(self): + """ + Enables Pan and Scan path by choosing the correct input image resolution. If you are changing + image processor attributes for PaS, please update this test. 
+ """ + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random numpy tensors + """This function prepares a list of PIL images""" + image_inputs = [np.random.randint(255, size=(3, 300, 600), dtype=np.uint8)] * 3 + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + + # Test not batched input, 3 images because we have base image + 2 crops + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = (3, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched, 9 images because we have base image + 2 crops per each item + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (9, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched unbalanced, 9 images because we have base image + 2 crops per each item + encoded_images = image_processing( + [[image_inputs[0], image_inputs[1]], [image_inputs[2]]], return_tensors="pt" + ).pixel_values + expected_output_image_shape = (9, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + def test_call_pil(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = (1, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + def test_call_numpy(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = (1, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + def test_call_pytorch(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True) + + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], 
return_tensors="pt").pixel_values + expected_output_image_shape = (1, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + @unittest.skip("Gemma3 doesn't work with 4 channels due to pan and scan method") + def test_call_numpy_4_channels(self): + pass + + @require_vision + @require_torch + def test_slow_fast_equivalence_batched_pas(self): + if not self.test_slow_image_processor or not self.test_fast_image_processor: + self.skipTest(reason="Skipping slow/fast equivalence test") + + if self.image_processing_class is None or self.fast_image_processing_class is None: + self.skipTest(reason="Skipping slow/fast equivalence test as one of the image processors is not defined") + + if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop: + self.skipTest( + reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors" + ) + crop_config = { + "do_pan_and_scan": True, + "pan_and_scan_max_num_crops": 448, + "pan_and_scan_min_crop_size": 32, + "pan_and_scan_min_ratio_to_activate": 0.3, + } + image_processor_dict = self.image_processor_dict + image_processor_dict.update(crop_config) + dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + image_processor_slow = self.image_processing_class(**image_processor_dict) + image_processor_fast = self.fast_image_processing_class(**image_processor_dict) + + encoding_slow = image_processor_slow(dummy_images, return_tensors="pt") + encoding_fast = image_processor_fast(dummy_images, return_tensors="pt") + + torch.testing.assert_close(encoding_slow.num_crops, encoding_fast.num_crops) + self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1)) + self.assertLessEqual( + torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 1e-3 + ) diff --git a/docs/transformers/tests/models/gemma3/test_modeling_gemma3.py b/docs/transformers/tests/models/gemma3/test_modeling_gemma3.py new file mode 100644 index 0000000000000000000000000000000000000000..be83749cf8bc7a1aaf4563d501dcf705ae94f301 --- /dev/null +++ b/docs/transformers/tests/models/gemma3/test_modeling_gemma3.py @@ -0,0 +1,666 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Gemma3 model.""" + +import tempfile +import unittest + +import pytest +from parameterized import parameterized + +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + Gemma3Config, + Gemma3TextConfig, + GenerationConfig, + is_torch_available, +) +from transformers.testing_utils import ( + cleanup, + require_flash_attn, + require_read_token, + require_torch, + require_torch_gpu, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...models.gemma.test_modeling_gemma import GemmaModelTester +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + Gemma3ForCausalLM, + Gemma3ForConditionalGeneration, + Gemma3Processor, + Gemma3TextModel, + ) + + +class Gemma3ModelTester(GemmaModelTester): + if is_torch_available(): + config_class = Gemma3TextConfig + model_class = Gemma3TextModel + for_causal_lm_class = Gemma3ForCausalLM + + +@require_torch +class Gemma3ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (Gemma3TextModel, Gemma3ForCausalLM) if is_torch_available() else () + all_generative_model_classes = (Gemma3ForCausalLM,) if is_torch_available() else () + test_headmasking = False + test_pruning = False + _is_stateful = True + model_split_percents = [0.5, 0.6] + + def setUp(self): + self.model_tester = Gemma3ModelTester(self) + self.config_tester = ConfigTester(self, config_class=Gemma3Config, hidden_size=37) + + @unittest.skip("Failing because of unique cache (HybridCache)") + def test_model_outputs_equivalence(self, **kwargs): + pass + + @parameterized.expand([("random",), ("same",)]) + @pytest.mark.generate + @unittest.skip("Gemma3 has HybridCache which is not compatible with assisted decoding") + def test_assisted_decoding_matches_greedy_search(self, assistant_type): + pass + + @unittest.skip("Gemma3 has HybridCache which is not compatible with assisted decoding") + def test_prompt_lookup_decoding_matches_greedy_search(self, assistant_type): + pass + + @pytest.mark.generate + @unittest.skip("Gemma3 has HybridCache which is not compatible with assisted decoding") + def test_assisted_decoding_sample(self): + pass + + @unittest.skip("Gemma3 has HybridCache which is not compatible with dola decoding") + def test_dola_decoding_sample(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support continue from past kv") + def test_generate_continue_from_past_key_values(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support low_memory generation") + def test_beam_search_low_memory(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate_low_memory(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support StaticCache. Though it could, it shouldn't support.") + def test_generate_with_static_cache(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support StaticCache. 
Though it could, it shouldn't support.") + def test_generate_from_inputs_embeds_with_static_cache(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support StaticCache. Though it could, it shouldn't support.") + def test_generate_continue_from_inputs_embeds(self): + pass + + @unittest.skip("Gemma3 has HybridCache which auto-compiles. Compile and FA2 don't work together.") + def test_eager_matches_fa2_generate(self): + pass + + @unittest.skip( + reason="HybridCache can't be gathered because it is not iterable. Adding a simple iter and dumping `distributed_iterator`" + " as in Dynamic Cache doesnt work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting" + ) + def test_multi_gpu_data_parallel_forward(self): + pass + + +class Gemma3Vision2TextModelTester: + def __init__( + self, + parent, + mm_tokens_per_image=2, + image_token_index=1, + boi_token_index=2, + eoi_token_index=3, + seq_length=25, + is_training=True, + vision_config={ + "use_labels": True, + "image_size": 20, + "patch_size": 5, + "num_channels": 3, + "is_training": True, + "hidden_size": 32, + "num_key_value_heads": 1, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "dropout": 0.1, + "attention_dropout": 0.1, + "initializer_range": 0.02, + }, + use_cache=False, + ): + self.parent = parent + # `image_token_index` is set to 0 to pass "resize_embeddings" test, do not modify + self.mm_tokens_per_image = mm_tokens_per_image + self.image_token_index = image_token_index + self.boi_token_index = boi_token_index + self.eoi_token_index = eoi_token_index + self.llm_tester = Gemma3ModelTester(self.parent) + self.text_config = self.llm_tester.get_config() + self.vision_config = vision_config + self.seq_length = seq_length + self.pad_token_id = self.text_config.pad_token_id + + self.num_hidden_layers = self.text_config.num_hidden_layers + self.vocab_size = self.text_config.vocab_size + self.hidden_size = self.text_config.hidden_size + self.num_attention_heads = self.text_config.num_attention_heads + self.is_training = is_training + + self.batch_size = 3 + self.num_channels = vision_config["num_channels"] + self.image_size = vision_config["image_size"] + self.encoder_seq_length = seq_length + self.use_cache = use_cache + + def get_config(self): + return Gemma3Config( + text_config=self.text_config, + vision_config=self.vision_config, + image_token_index=self.image_token_index, + boi_token_index=self.boi_token_index, + eoi_token_index=self.eoi_token_index, + mm_tokens_per_image=self.mm_tokens_per_image, + ) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [ + self.batch_size, + self.vision_config["num_channels"], + self.vision_config["image_size"], + self.vision_config["image_size"], + ] + ) + config = self.get_config() + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 + attention_mask = input_ids.ne(self.pad_token_id).to(torch_device) + + # set the 3 first tokens to be image, and ensure that no other tokens are image tokens + # do not change this unless you modified image size or patch size + input_ids[input_ids == config.image_token_index] = self.pad_token_id + input_ids[:, :1] = config.image_token_index + + token_type_ids = torch.zeros_like(input_ids) + token_type_ids[input_ids == 
config.image_token_index] = 1 + + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + return config, inputs_dict + + +@require_torch +class Gemma3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (Gemma3ForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (Gemma3ForConditionalGeneration,) if is_torch_available() else () + test_headmasking = False + test_pruning = False + test_missing_keys = False + _is_stateful = True + model_split_percents = [0.5, 0.6] + + # MP works but offload doesn't work when the SigLIP MultiheadAttention is offloaded + # TODO: One potential solution would be to add to set preload_module_classes = ["SiglipMultiheadAttentionPoolingHead"] + # in the dispatch_model function + test_cpu_offload = False + test_disk_offload_safetensors = False + test_disk_offload_bin = False + + def setUp(self): + self.model_tester = Gemma3Vision2TextModelTester(self) + self.config_tester = ConfigTester(self, config_class=Gemma3Config, hidden_size=37) + + @unittest.skip(reason="SiglipVisionModel (vision backbone) does not support standalone training") + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="SiglipVisionModel (vision backbone) does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip(reason="SiglipVisionModel (vision backbone) does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip( + reason="HybridCache can't be gathered because it is not iterable. Adding a simple iter and dumping `distributed_iterator`" + " as in Dynamic Cache doesnt work. 
NOTE: @gante all cache objects would need better compatibility with multi gpu setting" + ) + def test_multi_gpu_data_parallel_forward(self): + pass + + @unittest.skip("Failing because of unique cache (HybridCache)") + def test_model_outputs_equivalence(self, **kwargs): + pass + + @parameterized.expand([("random",), ("same",)]) + @pytest.mark.generate + @unittest.skip("Gemma3 has HybridCache which is not compatible with assisted decoding") + def test_assisted_decoding_matches_greedy_search(self, assistant_type): + pass + + @unittest.skip("Gemma3 has HybridCache which is not compatible with assisted decoding") + def test_prompt_lookup_decoding_matches_greedy_search(self, assistant_type): + pass + + @pytest.mark.generate + @unittest.skip("Gemma3 has HybridCache which is not compatible with assisted decoding") + def test_assisted_decoding_sample(self): + pass + + @unittest.skip("Gemma3 has HybridCache which is not compatible with dola decoding") + def test_dola_decoding_sample(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support continue from past kv") + def test_generate_continue_from_past_key_values(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support low_memory generation") + def test_beam_search_low_memory(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support contrastive generation") + def test_contrastive_generate_low_memory(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support StaticCache. Though it could, it shouldn't support.") + def test_generate_with_static_cache(self): + pass + + @unittest.skip("Gemma3 has HybridCache and doesn't support StaticCache. Though it could, it shouldn't support.") + def test_generate_from_inputs_embeds_with_static_cache(self): + pass + + @unittest.skip("Gemma3 has HybridCache which auto-compiles. Compile and FA2 don't work together.") + def test_eager_matches_fa2_generate(self): + pass + + @unittest.skip( + reason="Siglip (vision backbone) uses the same initialization scheme as the Flax original implementation" + ) + def test_initialization(self): + pass + + @unittest.skip( + reason="Siglip has no FLEX attention, and we don't have a proper way to set/test attn in VLMs. TODO @raushan" + ) + def test_flex_attention_with_grads(self): + pass + + def test_automodelforcausallm(self): + """ + Regression test for #36741/#36917 -- make sure `AutoModelForCausalLM` works with a Gemma3 config, i.e. 
that + `AutoModelForCausalLM.from_pretrained` pulls the text config before loading the model + """ + config = self.model_tester.get_config() + model = Gemma3ForConditionalGeneration(config) + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + for_causal_lm = AutoModelForCausalLM.from_pretrained(tmp_dir) + self.assertIsInstance(for_causal_lm, Gemma3ForConditionalGeneration) + + +@slow +@require_torch_gpu +@require_read_token +class Gemma3IntegrationTest(unittest.TestCase): + def setUp(self): + self.processor = Gemma3Processor.from_pretrained("google/gemma-3-4b-it", padding_side="left") + + url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" + self.messages = [ + {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}, + { + "role": "user", + "content": [ + {"type": "image", "url": url}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + ] + + def tearDown(self): + cleanup(torch_device, gc_collect=True) + + def test_model_4b_bf16(self): + model_id = "google/gemma-3-4b-it" + + model = Gemma3ForConditionalGeneration.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 + ).to(torch_device) + + inputs = self.processor.apply_chat_template( + self.messages, + tokenize=True, + return_dict=True, + return_tensors="pt", + add_generation_prompt=True, + ).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=30, do_sample=False) + output_text = self.processor.batch_decode(output, skip_special_tokens=True) + + EXPECTED_TEXTS = ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like'] # fmt: skip + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_4b_batch(self): + model_id = "google/gemma-3-4b-it" + + model = Gemma3ForConditionalGeneration.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 + ).to(torch_device) + + messages_2 = [ + {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}, + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png", + }, + {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}, + {"type": "text", "text": "Are these images identical?"}, + ], + }, + ] + + inputs = self.processor.apply_chat_template( + [self.messages, messages_2], + tokenize=True, + return_dict=True, + return_tensors="pt", + padding=True, + add_generation_prompt=True, + ).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=30, do_sample=False) + output_text = self.processor.batch_decode(output, skip_special_tokens=True) + + EXPECTED_TEXTS = [ + 'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like', + "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. 
\n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a cow" + ] # fmt: skip + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_4b_crops(self): + model_id = "google/gemma-3-4b-it" + + model = Gemma3ForConditionalGeneration.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 + ).to(torch_device) + + crop_config = { + "images_kwargs": { + "do_pan_and_scan": True, + "pan_and_scan_max_num_crops": 448, + "pan_and_scan_min_crop_size": 32, + "pan_and_scan_min_ratio_to_activate": 0.3, + } + } + + inputs = self.processor.apply_chat_template( + self.messages, + tokenize=True, + return_dict=True, + return_tensors="pt", + add_generation_prompt=True, + **crop_config, + ).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=30, do_sample=False) + output_text = self.processor.batch_decode(output, skip_special_tokens=True) + + EXPECTED_NUM_IMAGES = 3 # one for the origin image and two crops of images + EXPECTED_TEXTS = ['user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a beach with a turquoise ocean and blue sky in the background. It looks like the cow is enjoying the beach'] # fmt: skip + self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES) + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_4b_batch_crops(self): + model_id = "google/gemma-3-4b-it" + + model = Gemma3ForConditionalGeneration.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 + ).to(torch_device) + crop_config = { + "images_kwargs": { + "do_pan_and_scan": True, + "pan_and_scan_max_num_crops": 448, + "pan_and_scan_min_crop_size": 32, + "pan_and_scan_min_ratio_to_activate": 0.3, + } + } + messages_2 = [ + {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}, + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png", + }, + {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}, + {"type": "text", "text": "Are these images identical?"}, + ], + }, + ] + + inputs = self.processor.apply_chat_template( + [self.messages, messages_2], + tokenize=True, + return_dict=True, + return_tensors="pt", + padding=True, + add_generation_prompt=True, + **crop_config, + ).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=30, do_sample=False) + output_text = self.processor.batch_decode(output, skip_special_tokens=True) + EXPECTED_NUM_IMAGES = 9 # 3 * (one for the origin image and two crops of images) = 9 + EXPECTED_TEXTS = [ + "user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a beach with a turquoise ocean and blue sky in the background. It looks like the cow is enjoying the beach", + "user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. 
\n\nWhile they all feature a brown cow in the foreground and a similar background (including the stop signs and", + ] # fmt: skip + self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES) + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_4b_multiimage(self): + model_id = "google/gemma-3-4b-it" + + model = Gemma3ForConditionalGeneration.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 + ).to(torch_device) + + messages = [ + {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}, + { + "role": "user", + "content": [ + {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}, + {"type": "text", "text": "What do you see here?"}, + ], + }, + ] + + inputs = self.processor.apply_chat_template( + messages, + tokenize=True, + return_dict=True, + return_tensors="pt", + padding=True, + add_generation_prompt=True, + ).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=30, do_sample=False) + output_text = self.processor.batch_decode(output, skip_special_tokens=True) + + EXPECTED_TEXTS = ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt looks like a street scene in a vibrant,"] # fmt: skip + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_1b_text_only(self): + model_id = "google/gemma-3-1b-it" + + model = Gemma3ForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( + torch_device + ) + tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left") + inputs = tokenizer("Write a poem about Machine Learning.", return_tensors="pt").to(torch_device) + + output = model.generate(**inputs, max_new_tokens=30, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + EXPECTED_TEXTS = ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a river deep,\nWith patterns hidden, secrets sleep.\nA neural net, a watchful eye,\nLearning'] # fmt: skip + self.assertEqual(output_text, EXPECTED_TEXTS) + + # TODO: raushan FA2 generates gibberish for no reason, check later + @require_flash_attn + @require_torch_gpu + @pytest.mark.flash_attn_test + def test_model_4b_flash_attn(self): + model_id = "google/gemma-3-4b-it" + + model = Gemma3ForConditionalGeneration.from_pretrained( + model_id, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ).to(torch_device) + + inputs = self.processor.apply_chat_template( + self.messages, + tokenize=True, + return_dict=True, + return_tensors="pt", + add_generation_prompt=True, + ).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=30, do_sample=False) + output_text = self.processor.batch_decode(output, skip_special_tokens=True) + + EXPECTED_TEXTS = ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. It looks like a very sunny and'] # fmt: skip + self.assertEqual(output_text, EXPECTED_TEXTS) + + @parameterized.expand([("flash_attention_2",), ("sdpa",), ("eager",)]) + def test_generation_beyond_sliding_window(self, attn_implementation: str): + """Test that we can correctly generate beyond the sliding window. This is non trivial as + we need to correctly slice the attention mask in all cases (because we use a HybridCache). 
+ Outputs for every attention functions should be coherent and identical. + """ + model_id = "google/gemma-3-1b-it" + + input_text = [ + "This is a nice place. " * 800 + "I really enjoy the scenery,", # This is larger than 4096 tokens + "A list of colors: red, blue", # This will almost all be padding tokens + ] + tokenizer = AutoTokenizer.from_pretrained(model_id, padding="left") + inputs = tokenizer(input_text, padding=True, return_tensors="pt").to(torch_device) + + model = AutoModelForCausalLM.from_pretrained( + model_id, attn_implementation=attn_implementation, torch_dtype=torch.float16 + ).to(torch_device) + + # Make sure prefill is larger than sliding window + input_size = inputs.input_ids.shape[-1] + self.assertTrue(input_size > model.config.sliding_window) + + out = model.generate(**inputs, max_new_tokens=20, do_sample=False)[:, input_size:] + output_text = tokenizer.batch_decode(out) + + EXPECTED_COMPLETIONS = [" and I'm going to take a walk.\n\nI really enjoy the scenery, and I'", ", green, yellow, orange, purple, brown, black, white, gray.\n\nI'"] # fmt: skip + self.assertEqual(output_text, EXPECTED_COMPLETIONS) + + def test_generation_beyond_sliding_window_with_generation_config(self): + """ + Similar to `test_generation_beyond_sliding_window`, but passing a GenerationConfig. Regression test for #36684 + -- ensures `cache_implementation='hybrid'` is correctly inherited from the base `model.generation_config`. + """ + model_id = "google/gemma-3-1b-it" + attn_implementation = "sdpa" + + input_text = [ + "This is a nice place. " * 800 + "I really enjoy the scenery,", # This is larger than 4096 tokens + "A list of colors: red, blue", # This will almost all be padding tokens + ] + tokenizer = AutoTokenizer.from_pretrained(model_id, padding="left") + inputs = tokenizer(input_text, padding=True, return_tensors="pt").to(torch_device) + + model = AutoModelForCausalLM.from_pretrained( + model_id, attn_implementation=attn_implementation, torch_dtype=torch.float16 + ).to(torch_device) + + # Make sure prefill is larger than sliding window + input_size = inputs.input_ids.shape[-1] + self.assertGreater(input_size, model.config.sliding_window) + + generation_config = GenerationConfig(max_new_tokens=5, min_new_tokens=5) + out = model.generate(**inputs, generation_config=generation_config) + + out = model.generate(**inputs, generation_config=generation_config, do_sample=False)[:, input_size:] + output_text = tokenizer.batch_decode(out) + EXPECTED_COMPLETIONS = [" and I'm going to take a walk.\n\nI really enjoy the scenery, and I'", ", green, yellow, orange, purple, brown, black, white, gray.\n\nI'"] # fmt: skip + self.assertEqual(output_text, EXPECTED_COMPLETIONS) + + # Generation works beyond sliding window + self.assertGreater(out.shape[1], model.config.sliding_window) + self.assertEqual(out.shape[1], input_size + 5) + + # Note: Auto-inheritance only works for models saved starting from 4.50.0 + model.generation_config.transformers_version = "4.49.0" + with self.assertRaises(RuntimeError): # errors out because it is not using hybrid cache + out = model.generate(**inputs, generation_config=generation_config) diff --git a/docs/transformers/tests/models/gemma3/test_processing_gemma3.py b/docs/transformers/tests/models/gemma3/test_processing_gemma3.py new file mode 100644 index 0000000000000000000000000000000000000000..968a852d64edfb31bf8e519c519a68ede1f3a639 --- /dev/null +++ b/docs/transformers/tests/models/gemma3/test_processing_gemma3.py @@ -0,0 +1,151 @@ +# Copyright 2025 The HuggingFace Team. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import shutil +import tempfile +import unittest +from typing import Optional + +from transformers import Gemma3Processor, GemmaTokenizer +from transformers.testing_utils import get_tests_dir, require_vision +from transformers.utils import is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import Gemma3ImageProcessor + +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") + + +@require_vision +class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = Gemma3Processor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + gemma3_image_processor_kwargs = { + "do_pan_and_scan": True, + "pan_and_scan_min_crop_size": 256, + "pan_and_scan_max_num_crops": 4, + "pan_and_scan_min_ratio_to_activate": 1.2, + } + image_processor = Gemma3ImageProcessor.from_pretrained( + "google/siglip-so400m-patch14-384", **gemma3_image_processor_kwargs + ) + + extra_special_tokens = { + "image_token": "", + "boi_token": "", + "eoi_token": "", + } + tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True, extra_special_tokens=extra_special_tokens) + processor_kwargs = cls.prepare_processor_dict() + processor = Gemma3Processor(image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs) + processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.boi_token + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname, ignore_errors=True) + + # TODO: raushan or arthur: add the real chat template + @staticmethod + def prepare_processor_dict(): + return { + "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'model\n'}}\n{%- endif -%}\n", "image_seq_length": 3, + } # fmt: skip + + # Override as Gemma3 needs images to be an explicitly nested batch 
+ def prepare_image_inputs(self, batch_size: Optional[int] = None): + """This function prepares a list of PIL images for testing""" + images = super().prepare_image_inputs(batch_size) + if isinstance(images, (list, tuple)): + images = [[image] for image in images] + return images + + def test_text_with_image_tokens(self): + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + text_multi_images = f"{processor.boi_token}{processor.boi_token}Dummy text!" + text_single_image = f"{processor.boi_token}Dummy text!" + text_no_image = "Dummy text!" + + image = self.prepare_image_inputs() + + # If the text has no image tokens, `images` should be `None` + with self.assertRaises(ValueError): + _ = processor(text=text_no_image, images=image, return_tensors="np") + + # We can't be sure what the user's intention is: one image per text, OR two images for the first text and none for the second + with self.assertRaises(ValueError): + _ = processor(text=[text_single_image, text_single_image], images=[image, image], return_tensors="np") + + # The user is expected to be explicit about which images belong to which text by nesting the images list + out_multiimages = processor(text=text_multi_images, images=[image, image], return_tensors="np") + out_batch_oneimage = processor( + text=[text_single_image, text_single_image], images=[[image], [image]], return_tensors="np" + ) + self.assertListEqual( + out_batch_oneimage[self.images_input_name].tolist(), out_multiimages[self.images_input_name].tolist() + ) + + def test_pan_and_scan(self): + processor_components = self.prepare_components() + processor_kwargs = self.prepare_processor_dict() + processor = self.processor_class(**processor_components, **processor_kwargs) + + input_str = self.prepare_text_inputs(modality="image") + image_input = self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="np", + do_pan_and_scan=True, + image_seq_length=2, + pan_and_scan_min_crop_size=10, + ) + + # base image + 4 crops + self.assertEqual(len(inputs[self.images_input_name]), 5) + self.assertEqual(len(inputs[self.text_input_name][0]), 67) + + def test_special_mm_token_truncation(self): + """Tests that special vision tokens do not get truncated when `truncation=True` is set.""" + + processor = self.get_processor() + + input_str = self.prepare_text_inputs(batch_size=2, modality="image") + image_input = self.prepare_image_inputs(batch_size=2) + _ = processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=None, + padding=True, + ) + + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=True, + padding=True, + max_length=5, + ) diff --git a/docs/transformers/tests/models/git/__init__.py b/docs/transformers/tests/models/git/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/git/test_modeling_git.py b/docs/transformers/tests/models/git/test_modeling_git.py new file mode 100644 index 0000000000000000000000000000000000000000..38aa2b4e879c9a33e523a2f653f49df59fc1e3ed --- /dev/null +++ b/docs/transformers/tests/models/git/test_modeling_git.py @@ -0,0 +1,621 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import unittest + +from huggingface_hub import hf_hub_download + +from transformers import GitConfig, GitProcessor, GitVisionConfig, is_torch_available, is_vision_available +from transformers.models.auto import get_values +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import MODEL_FOR_CAUSAL_LM_MAPPING, GitForCausalLM, GitModel, GitVisionModel + + +if is_vision_available(): + from PIL import Image + + +class GitVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=32, + patch_size=16, + num_channels=3, + is_training=True, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return GitVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = GitVisionModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * 
(image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class GitVisionModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as GIT does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (GitVisionModel,) if is_torch_available() else () + fx_compatible = True + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = GitVisionModelTester(self) + self.config_tester = ConfigTester(self, config_class=GitVisionConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="GIT does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "microsoft/git-base" + model = GitVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class GitModelTester: + def __init__( + self, + parent, + num_channels=3, + image_size=32, + patch_size=16, + batch_size=13, + text_seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02, + num_labels=3, + scope=None, + ): + self.parent = parent + self.num_channels = num_channels + self.image_size = image_size + self.patch_size = patch_size + 
self.batch_size = batch_size + self.text_seq_length = text_seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.scope = scope + + # make sure the BOS, EOS and PAD tokens are within the vocab + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + # for GIT, the sequence length is the sum of the text and patch tokens, + 1 due to the CLS token + self.seq_length = self.text_seq_length + int((self.image_size / self.patch_size) ** 2) + 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.text_seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.text_seq_length]) + + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + config = self.get_config() + + return config, input_ids, input_mask, pixel_values + + def get_config(self): + """ + Returns a tiny configuration by default. + """ + return GitConfig( + vision_config={ + "num_channels": self.num_channels, + "image_size": self.image_size, + "patch_size": self.patch_size, + "hidden_size": self.hidden_size, + "projection_dim": 32, + "num_hidden_layers": self.num_hidden_layers, + "num_attention_heads": self.num_attention_heads, + }, + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + ) + + def create_and_check_model(self, config, input_ids, input_mask, pixel_values): + model = GitModel(config=config) + model.to(torch_device) + model.eval() + + # inference with pixel values + result = model(input_ids, attention_mask=input_mask, pixel_values=pixel_values) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + # inference without pixel values + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.text_seq_length, self.hidden_size) + ) + + def create_and_check_for_causal_lm(self, config, input_ids, input_mask, pixel_values): + model = GitForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # inference with pixel values + result = model(input_ids, attention_mask=input_mask, pixel_values=pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + # inference without pixel values + result = model(input_ids, 
attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.text_seq_length, self.vocab_size)) + + # training + result = model(input_ids, attention_mask=input_mask, pixel_values=pixel_values, labels=input_ids) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertTrue(result.loss.item() > 0) + + def _test_beam_search_generate(self, config, input_ids, input_mask, pixel_values): + model = GitForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # generate + generated_ids = model.generate( + input_ids, + attention_mask=input_mask, + pixel_values=pixel_values, + do_sample=False, + max_length=20, + num_beams=2, + num_return_sequences=2, + ) + + self.parent.assertEqual(generated_ids.shape, (self.batch_size * 2, 20)) + + def _test_batched_generate_captioning(self, config, input_ids, input_mask, pixel_values): + model = GitForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # generate + generated_ids = model.generate( + input_ids=None, # captioning -> no input_ids + attention_mask=None, + pixel_values=pixel_values, + do_sample=False, + min_length=20, + max_length=20, + num_beams=2, + num_return_sequences=2, + ) + + self.parent.assertEqual(generated_ids.shape, (self.batch_size * 2, 20)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + pixel_values, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "attention_mask": input_mask, + "pixel_values": pixel_values, + } + + return config, inputs_dict + + +@require_torch +class GitModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (GitModel, GitForCausalLM) if is_torch_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": GitModel, + "image-to-text": GitForCausalLM, + "text-generation": GitForCausalLM, + "image-text-to-text": GitForCausalLM, + } + if is_torch_available() + else {} + ) + fx_compatible = False + test_torchscript = False + + # special case for GitForCausalLM model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in get_values(MODEL_FOR_CAUSAL_LM_MAPPING): + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.text_seq_length), + dtype=torch.long, + device=torch_device, + ) + return inputs_dict + + def setUp(self): + self.model_tester = GitModelTester(self) + self.config_tester = ConfigTester(self, config_class=GitConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + def test_beam_search_generate(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester._test_beam_search_generate(*config_and_inputs) + + def 
test_batched_generate_captioning(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester._test_batched_generate_captioning(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def _check_attentions_for_generate( + self, batch_size, attentions, prompt_length, output_length, config, decoder_past_key_values + ): + # GIT attention shape depends on image inputs, overwrite + image_length = int((config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1) + prompt_length += image_length + output_length += image_length + super()._check_attentions_for_generate( + batch_size, attentions, prompt_length, output_length, config, decoder_past_key_values + ) + + def _check_hidden_states_for_generate( + self, batch_size, hidden_states, prompt_length, output_length, config, use_cache=False + ): + # GIT attention shape depends on image inputs, overwrite + image_length = int((config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1) + prompt_length += image_length + output_length += image_length + super()._check_hidden_states_for_generate( + batch_size, hidden_states, prompt_length, output_length, config, use_cache=use_cache + ) + + @slow + def test_model_from_pretrained(self): + model_name = "microsoft/git-base" + model = GitModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @unittest.skip(reason="GIT has pixel values as additional input") + def test_beam_search_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip(reason="GIT has pixel values as additional input") + def test_contrastive_generate(self): + pass + + @unittest.skip(reason="GIT has pixel values as additional input") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip(reason="GIT has pixel values as additional input") + def test_contrastive_generate_low_memory(self): + pass + + @unittest.skip(reason="GIT has pixel values as additional input") + def test_greedy_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip(reason="GIT has pixel values as additional input") + def test_dola_decoding_sample(self): + pass + + +@require_torch +@require_vision +@slow +class GitModelIntegrationTest(unittest.TestCase): + def test_forward_pass(self): + processor = GitProcessor.from_pretrained("microsoft/git-base") + model = GitForCausalLM.from_pretrained("microsoft/git-base") + + model.to(torch_device) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = processor(images=image, text="hello world", return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(**inputs) + + expected_shape = torch.Size((1, 201, 30522)) + self.assertEqual(outputs.logits.shape, expected_shape) + expected_slice = torch.tensor( + [[-0.9514, -0.9512, -0.9507], [-0.5454, -0.5453, -0.5453], [-0.8862, -0.8857, -0.8848]], + device=torch_device, + ) + torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + def test_inference_image_captioning(self): + processor = GitProcessor.from_pretrained("microsoft/git-base") + model = GitForCausalLM.from_pretrained("microsoft/git-base") + model.to(torch_device) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = 
processor(images=image, return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + outputs = model.generate( + pixel_values=pixel_values, max_length=20, output_scores=True, return_dict_in_generate=True + ) + generated_caption = processor.batch_decode(outputs.sequences, skip_special_tokens=True)[0] + + expected_shape = torch.Size((1, 9)) + self.assertEqual(outputs.sequences.shape, expected_shape) + self.assertEqual(generated_caption, "two cats laying on a pink blanket") + self.assertTrue(outputs.scores[-1].shape, expected_shape) + expected_slice = torch.tensor([-0.8805, -0.8803, -0.8799], device=torch_device) + torch.testing.assert_close(outputs.scores[-1][0, :3], expected_slice, rtol=1e-4, atol=1e-4) + + def test_visual_question_answering(self): + processor = GitProcessor.from_pretrained("microsoft/git-base-textvqa") + model = GitForCausalLM.from_pretrained("microsoft/git-base-textvqa") + model.to(torch_device) + + # prepare image + file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset") + image = Image.open(file_path).convert("RGB") + inputs = processor(images=image, return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + # prepare question + question = "what does the front of the bus say at the top?" + input_ids = processor(text=question, add_special_tokens=False).input_ids + input_ids = [processor.tokenizer.cls_token_id] + input_ids + input_ids = torch.tensor(input_ids).unsqueeze(0).to(torch_device) + + generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=20) + generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + + expected_shape = torch.Size((1, 15)) + self.assertEqual(generated_ids.shape, expected_shape) + self.assertEqual(generated_caption, "what does the front of the bus say at the top? special") + + def test_batched_generation(self): + processor = GitProcessor.from_pretrained("microsoft/git-base-coco") + model = GitForCausalLM.from_pretrained("microsoft/git-base-coco") + model.to(torch_device) + + # create batch of size 2 + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = processor(images=[image, image], return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + # we have to prepare `input_ids` with the same batch size as `pixel_values` + start_token_id = model.config.bos_token_id + input_ids = torch.tensor([[start_token_id], [start_token_id]], device=torch_device) + generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50) + generated_captions = processor.batch_decode(generated_ids, skip_special_tokens=True) + + self.assertEqual(generated_captions, ["two cats sleeping on a pink blanket next to remotes."] * 2) + + @slow + def test_inference_interpolate_pos_encoding(self): + # CLIP family models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. 
+ model = GitModel.from_pretrained("microsoft/git-base").to(torch_device) + + processor = GitProcessor.from_pretrained( + "microsoft/git-base", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180} + ) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) + + # interpolate_pos_encoding=False should raise a ValueError + with self.assertRaises(ValueError, msg="doesn't match model"): + with torch.no_grad(): + model(**inputs, interpolate_pos_encoding=False) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 130, 768)) + + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[-1.0296, 2.5960, 0.8703], [1.7027, 1.3302, -0.4543], [-1.4932, -0.1084, 0.0502]] + ).to(torch_device) + + torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/git/test_processor_git.py b/docs/transformers/tests/models/git/test_processor_git.py new file mode 100644 index 0000000000000000000000000000000000000000..c15301a5875ac3a2df226d86fece3f975b2802c0 --- /dev/null +++ b/docs/transformers/tests/models/git/test_processor_git.py @@ -0,0 +1,146 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import shutil +import tempfile +import unittest + +import pytest + +from transformers.testing_utils import require_vision +from transformers.utils import is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import AutoProcessor, BertTokenizer, CLIPImageProcessor, GitProcessor, PreTrainedTokenizerFast + + +@require_vision +class GitProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = GitProcessor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + + image_processor = CLIPImageProcessor() + tokenizer = BertTokenizer.from_pretrained( + "hf-internal-testing/tiny-random-BertModel", model_input_names=["input_ids", "attention_mask"] + ) + + processor = GitProcessor(image_processor, tokenizer) + + processor.save_pretrained(cls.tmpdirname) + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname, ignore_errors=True) + + def test_save_load_pretrained_additional_features(self): + with tempfile.TemporaryDirectory() as tmpdir: + processor = GitProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor.save_pretrained(tmpdir) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + + processor = GitProcessor.from_pretrained( + tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, CLIPImageProcessor) + + def test_image_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) + + image_input = self.prepare_image_inputs() + + input_feat_extract = image_processor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str, return_token_type_ids=False) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"]) + + # test if it raises when no input is 
passed + with pytest.raises(ValueError): + processor() + + def test_tokenizer_decode(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + def test_model_input_names(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + # For now the processor supports only ['input_ids', 'attention_mask', 'pixel_values'] + self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"]) diff --git a/docs/transformers/tests/models/glm/__init__.py b/docs/transformers/tests/models/glm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/glm/test_modeling_glm.py b/docs/transformers/tests/models/glm/test_modeling_glm.py new file mode 100644 index 0000000000000000000000000000000000000000..9e8eda5cb239649ab853de1980e380686294fb02 --- /dev/null +++ b/docs/transformers/tests/models/glm/test_modeling_glm.py @@ -0,0 +1,429 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Glm model.""" + +import unittest + +import pytest + +from transformers import AutoModelForCausalLM, AutoTokenizer, GlmConfig, is_torch_available +from transformers.testing_utils import ( + is_flaky, + require_flash_attn, + require_torch, + require_torch_large_accelerator, + require_torch_sdpa, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + GlmForCausalLM, + GlmForSequenceClassification, + GlmForTokenClassification, + GlmModel, + ) + + +@require_torch +class GlmModelTester: + config_class = GlmConfig + if is_torch_available(): + model_class = GlmModel + for_causal_lm_class = GlmForCausalLM + for_sequence_class = GlmForSequenceClassification + for_token_class = GlmForTokenClassification + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=2, + intermediate_size=37, + hidden_act="silu", + attention_dropout=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + pad_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + self.head_dim = self.hidden_size // self.num_attention_heads + + # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTester.prepare_config_and_inputs + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return self.config_class( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + 
num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + num_key_value_heads=self.num_key_value_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + head_dim=self.head_dim, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = self.model_class(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common with Llama->Glm + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class GlmModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + (GlmModel, GlmForCausalLM, GlmForSequenceClassification, GlmForTokenClassification) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": GlmModel, + "text-classification": GlmForSequenceClassification, + "token-classification": GlmForTokenClassification, + "text-generation": GlmForCausalLM, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + + def setUp(self): + self.model_tester = GlmModelTester(self) + self.config_tester = ConfigTester(self, config_class=GlmConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_Glm_sequence_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + print(config) + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = self.model_tester.for_sequence_class(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_Glm_sequence_classification_model_for_single_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "single_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = 
ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = self.model_tester.for_sequence_class(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_Glm_sequence_classification_model_for_multi_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "multi_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor( + [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size + ).to(torch.float) + model = self.model_tester.for_sequence_class(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_Glm_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) + model = self.model_tester.for_token_class(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), + ) + + @is_flaky() + def test_custom_4d_attention_mask(self): + """Overwrite the common test to use atol=1e-3 instead of 1e-4. 
Can still rarely fail, thus flaky.""" + for model_class in self.all_generative_model_classes: + if not model_class._supports_static_cache: + self.skipTest(f"{model_class.__name__} is not guaranteed to work with custom 4D attention masks") + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + if getattr(config, "sliding_window", 0) is not None and getattr(config, "sliding_window", 0) > 0: + self.skipTest(f"{model_class.__name__} with sliding window attention is not supported by this test") + model = model_class(config).to(device=torch_device, dtype=torch.float32) + + ( + input_ids, + position_ids, + input_ids_shared_prefix, + mask_shared_prefix, + position_ids_shared_prefix, + ) = self._get_custom_4d_mask_test_data() + + logits = model.forward(input_ids, position_ids=position_ids).logits + # logits.shape == torch.Size([3, 4, ...]) + + logits_shared_prefix = model( + input_ids_shared_prefix, + attention_mask=mask_shared_prefix, + position_ids=position_ids_shared_prefix, + )[0] + # logits_shared_prefix.shape == torch.Size([1, 6, ...]) + + out_last_tokens = logits[:, -1, :] # last tokens in each batch line + out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens + + # comparing softmax-normalized logits: + normalized_0 = torch.nn.functional.softmax(out_last_tokens) + normalized_1 = torch.nn.functional.softmax(out_shared_prefix_last_tokens) + print(torch.abs(normalized_0 - normalized_1).max()) + + torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-3) + + +@slow +@require_torch_large_accelerator +class GlmIntegrationTest(unittest.TestCase): + input_text = ["Hello I am doing", "Hi today"] + model_id = "THUDM/glm-4-9b" + revision = "refs/pr/15" + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + def test_model_9b_fp16(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, revision=self.revision + ).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_9b_bf16(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, revision=self.revision + ).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", 
padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_9b_eager(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + attn_implementation="eager", + revision=self.revision, + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_torch_sdpa + def test_model_9b_sdpa(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + attn_implementation="sdpa", + revision=self.revision, + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_flash_attn + @pytest.mark.flash_attn_test + def test_model_9b_flash_attn(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", + revision=self.revision, + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) diff --git a/docs/transformers/tests/models/glm4/__init__.py b/docs/transformers/tests/models/glm4/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/glm4/test_modeling_glm4.py b/docs/transformers/tests/models/glm4/test_modeling_glm4.py new file mode 100644 index 0000000000000000000000000000000000000000..547b696867d2903473b1728922fec5baeecdd479 --- /dev/null +++ b/docs/transformers/tests/models/glm4/test_modeling_glm4.py @@ -0,0 +1,205 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Glm4 model.""" + +import unittest + +import pytest + +from transformers import AutoModelForCausalLM, AutoTokenizer, Glm4Config, is_torch_available +from transformers.testing_utils import ( + require_flash_attn, + require_torch, + require_torch_large_gpu, + require_torch_sdpa, + slow, + torch_device, +) + +from ...models.gemma.test_modeling_gemma import GemmaModelTest, GemmaModelTester +from ...test_configuration_common import ConfigTester + + +if is_torch_available(): + import torch + + from transformers import ( + Glm4ForCausalLM, + Glm4ForSequenceClassification, + Glm4ForTokenClassification, + Glm4Model, + ) + + +class Glm4ModelTester(GemmaModelTester): + if is_torch_available(): + config_class = Glm4Config + model_class = Glm4Model + for_causal_lm_class = Glm4ForCausalLM + for_sequence_class = Glm4ForSequenceClassification + for_token_class = Glm4ForTokenClassification + + +@require_torch +class Glm4ModelTest(GemmaModelTest, unittest.TestCase): + all_model_classes = ( + (Glm4Model, Glm4ForCausalLM, Glm4ForSequenceClassification, Glm4ForTokenClassification) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": Glm4Model, + "text-classification": Glm4ForSequenceClassification, + "token-classification": Glm4ForTokenClassification, + "text-generation": Glm4ForCausalLM, + "zero-shot": Glm4ForSequenceClassification, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + _is_stateful = True + model_split_percents = [0.5, 0.6] + + def setUp(self): + self.model_tester = Glm4ModelTester(self) + self.config_tester = ConfigTester(self, config_class=Glm4Config, hidden_size=37) + + +@slow +@require_torch_large_gpu +class Glm4IntegrationTest(unittest.TestCase): + input_text = ["Hello I am doing", "Hi today"] + model_id = "THUDM/glm-4-0414-9b-chat" + revision = "refs/pr/15" + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + def test_model_9b_fp16(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16, revision=self.revision + ).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, 
max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_9b_bf16(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, revision=self.revision + ).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + def test_model_9b_eager(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + attn_implementation="eager", + revision=self.revision, + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_torch_sdpa + def test_model_9b_sdpa(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + attn_implementation="sdpa", + revision=self.revision, + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_flash_attn + @pytest.mark.flash_attn_test + def test_model_9b_flash_attn(self): + EXPECTED_TEXTS = [ + "Hello I am doing a project on the history of the internetSolution:\n\nStep 1: Introduction\nThe history of the", + "Hi today I am going to show you how to make a simple and easy to make a DIY paper flower.", + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", + revision=self.revision, + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id, revision=self.revision) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, 
skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) diff --git a/docs/transformers/tests/models/glpn/__init__.py b/docs/transformers/tests/models/glpn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/glpn/test_image_processing_glpn.py b/docs/transformers/tests/models/glpn/test_image_processing_glpn.py new file mode 100644 index 0000000000000000000000000000000000000000..7f6a960755e724298d27a31e3cb768635d7ff0e4 --- /dev/null +++ b/docs/transformers/tests/models/glpn/test_image_processing_glpn.py @@ -0,0 +1,163 @@ +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import GLPNImageProcessor + + +class GLPNImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size_divisor=32, + do_rescale=True, + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size_divisor = size_divisor + self.do_rescale = do_rescale + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size_divisor": self.size_divisor, + "do_rescale": self.do_rescale, + } + + def expected_output_image_shape(self, images): + if isinstance(images[0], Image.Image): + width, height = images[0].size + elif isinstance(images[0], np.ndarray): + height, width = images[0].shape[0], images[0].shape[1] + else: + height, width = images[0].shape[1], images[0].shape[2] + + height = height // self.size_divisor * self.size_divisor + width = width // self.size_divisor * self.size_divisor + + return self.num_channels, height, width + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + size_divisor=self.size_divisor, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class GLPNImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = GLPNImageProcessor if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = GLPNImageProcessingTester(self) + + @property + def image_processor_dict(self): + return 
self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size_divisor")) + self.assertTrue(hasattr(image_processing, "resample")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + + def test_call_pil(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input (GLPNImageProcessor doesn't support batching) + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertTrue(tuple(encoded_images.shape) == (1, *expected_output_image_shape)) + + def test_call_numpy(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input (GLPNImageProcessor doesn't support batching) + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertTrue(tuple(encoded_images.shape) == (1, *expected_output_image_shape)) + + def test_call_pytorch(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input (GLPNImageProcessor doesn't support batching) + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertTrue(tuple(encoded_images.shape) == (1, *expected_output_image_shape)) + + def test_call_numpy_4_channels(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + self.image_processing_class.num_channels = 4 + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input (GLPNImageProcessor doesn't support batching) + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertTrue(tuple(encoded_images.shape) == (1, *expected_output_image_shape)) + self.image_processing_class.num_channels = 3 diff --git a/docs/transformers/tests/models/glpn/test_modeling_glpn.py b/docs/transformers/tests/models/glpn/test_modeling_glpn.py new file mode 100644 index 0000000000000000000000000000000000000000..94f357455b7de9643aa673d6fd50c200e3576326 --- 
/dev/null +++ b/docs/transformers/tests/models/glpn/test_modeling_glpn.py @@ -0,0 +1,345 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch GLPN model.""" + +import unittest + +from transformers import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import GLPNConfig, GLPNForDepthEstimation, GLPNModel + from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES + + +if is_vision_available(): + from PIL import Image + + from transformers import GLPNImageProcessor + + +class GLPNConfigTester(ConfigTester): + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, "hidden_sizes")) + self.parent.assertTrue(hasattr(config, "num_attention_heads")) + self.parent.assertTrue(hasattr(config, "num_encoder_blocks")) + + +class GLPNModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=64, + num_channels=3, + num_encoder_blocks=4, + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + hidden_sizes=[16, 32, 64, 128], + downsampling_rates=[1, 4, 8, 16], + num_attention_heads=[1, 2, 4, 8], + is_training=True, + use_labels=True, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + decoder_hidden_size=16, + num_labels=3, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.num_encoder_blocks = num_encoder_blocks + self.sr_ratios = sr_ratios + self.depths = depths + self.hidden_sizes = hidden_sizes + self.downsampling_rates = downsampling_rates + self.num_attention_heads = num_attention_heads + self.is_training = is_training + self.use_labels = use_labels + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.decoder_hidden_size = decoder_hidden_size + self.num_labels = num_labels + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + return config, pixel_values, labels + + def get_config(self): + return GLPNConfig( + image_size=self.image_size, + num_channels=self.num_channels, + num_encoder_blocks=self.num_encoder_blocks, + depths=self.depths, + hidden_sizes=self.hidden_sizes, + 
num_attention_heads=self.num_attention_heads, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + initializer_range=self.initializer_range, + decoder_hidden_size=self.decoder_hidden_size, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = GLPNModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + expected_height = expected_width = self.image_size // (self.downsampling_rates[-1] * 2) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.hidden_sizes[-1], expected_height, expected_width) + ) + + def create_and_check_for_depth_estimation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = GLPNForDepthEstimation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class GLPNModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (GLPNModel, GLPNForDepthEstimation) if is_torch_available() else () + pipeline_model_mapping = ( + {"depth-estimation": GLPNForDepthEstimation, "image-feature-extraction": GLPNModel} + if is_torch_available() + else {} + ) + + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = GLPNModelTester(self) + self.config_tester = GLPNConfigTester(self, config_class=GLPNConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_depth_estimation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + + @unittest.skip(reason="GLPN does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="GLPN does not have get_input_embeddings method and get_output_embeddings methods") + def test_model_get_set_embeddings(self): + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + + expected_num_attentions = sum(self.model_tester.depths) + self.assertEqual(len(attentions), expected_num_attentions) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs 
= model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + + self.assertEqual(len(attentions), expected_num_attentions) + + # verify the first attentions (first block, first layer) + expected_seq_len = (self.model_tester.image_size // 4) ** 2 + expected_reduced_seq_len = (self.model_tester.image_size // (4 * self.model_tester.sr_ratios[0])) ** 2 + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads[0], expected_seq_len, expected_reduced_seq_len], + ) + + # verify the last attentions (last block, last layer) + expected_seq_len = (self.model_tester.image_size // 32) ** 2 + expected_reduced_seq_len = (self.model_tester.image_size // (32 * self.model_tester.sr_ratios[-1])) ** 2 + self.assertListEqual( + list(attentions[-1].shape[-3:]), + [self.model_tester.num_attention_heads[-1], expected_seq_len, expected_reduced_seq_len], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + 1, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), expected_num_attentions) + # verify the first attentions (first block, first layer) + expected_seq_len = (self.model_tester.image_size // 4) ** 2 + expected_reduced_seq_len = (self.model_tester.image_size // (4 * self.model_tester.sr_ratios[0])) ** 2 + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads[0], expected_seq_len, expected_reduced_seq_len], + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = self.model_tester.num_encoder_blocks + self.assertEqual(len(hidden_states), expected_num_layers) + + # verify the first hidden states (first block) + self.assertListEqual( + list(hidden_states[0].shape[-3:]), + [ + self.model_tester.hidden_sizes[0], + self.model_tester.image_size // 4, + self.model_tester.image_size // 4, + ], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_training(self): + if not self.model_tester.is_training: + self.skipTest(reason="model_tester.is_training is set to False") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + if model_class.__name__ in MODEL_MAPPING_NAMES.values(): + continue + # TODO: remove the following 3 lines once we have a MODEL_FOR_DEPTH_ESTIMATION_MAPPING + # this can then be incorporated into _prepare_for_class in test_modeling_common.py + if model_class.__name__ == "GLPNForDepthEstimation": + batch_size, num_channels, height, width = 
inputs_dict["pixel_values"].shape + inputs_dict["labels"] = torch.zeros( + [self.model_tester.batch_size, height, width], device=torch_device + ).long() + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + @slow + def test_model_from_pretrained(self): + model_name = "vinvino02/glpn-kitti" + model = GLPNModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +@slow +class GLPNModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_depth_estimation(self): + image_processor = GLPNImageProcessor.from_pretrained("vinvino02/glpn-kitti") + model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the predicted depth + expected_shape = torch.Size([1, 480, 640]) + self.assertEqual(outputs.predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[3.4291, 2.7865, 2.5151], [3.2841, 2.7021, 2.3502], [3.1147, 2.4625, 2.2481]] + ).to(torch_device) + + torch.testing.assert_close(outputs.predicted_depth[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/got_ocr2/__init__.py b/docs/transformers/tests/models/got_ocr2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/got_ocr2/test_image_processing_got_ocr2.py b/docs/transformers/tests/models/got_ocr2/test_image_processing_got_ocr2.py new file mode 100644 index 0000000000000000000000000000000000000000..a47dccf0631cd3ea90e8fa58b6b517e533c918bd --- /dev/null +++ b/docs/transformers/tests/models/got_ocr2/test_image_processing_got_ocr2.py @@ -0,0 +1,177 @@ +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +from transformers.image_utils import SizeDict +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from transformers import GotOcr2ImageProcessor + + if is_torchvision_available(): + from transformers import GotOcr2ImageProcessorFast + + +class GotOcr2ImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + do_pad=False, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + do_convert_rgb=True, + ): + super().__init__() + size = size if size is not None else {"height": 20, "width": 20} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_pad = do_pad + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + "do_pad": self.do_pad, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.size["height"], self.size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class GotOcr2ProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = GotOcr2ImageProcessor if is_vision_available() else None + fast_image_processing_class = GotOcr2ImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = GotOcr2ImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processor, "do_resize")) + self.assertTrue(hasattr(image_processor, "size")) + self.assertTrue(hasattr(image_processor, "do_normalize")) + self.assertTrue(hasattr(image_processor, "image_mean")) + self.assertTrue(hasattr(image_processor, "image_std")) + self.assertTrue(hasattr(image_processor, "do_convert_rgb")) + + def test_slow_fast_equivalence_crop_to_patches(self): + dummy_image = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)[0] + + image_processor_slow = self.image_processing_class(**self.image_processor_dict, crop_to_patches=True) + image_processor_fast = 
self.fast_image_processing_class(**self.image_processor_dict, crop_to_patches=True) + + encoding_slow = image_processor_slow(dummy_image, return_tensors="pt") + encoding_fast = image_processor_fast(dummy_image, return_tensors="pt") + + torch.testing.assert_close(encoding_slow.num_patches, encoding_fast.num_patches) + self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1)) + self.assertLessEqual( + torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 1e-3 + ) + + def test_slow_fast_equivalence_batched_crop_to_patches(self): + # Prepare image inputs so that we have two groups of images with equal resolution with a group of images with + # different resolutions in between + dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True) + dummy_images += self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + dummy_images += self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True) + + image_processor_slow = self.image_processing_class(**self.image_processor_dict, crop_to_patches=True) + image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict, crop_to_patches=True) + + encoding_slow = image_processor_slow(dummy_images, return_tensors="pt") + encoding_fast = image_processor_fast(dummy_images, return_tensors="pt") + + torch.testing.assert_close(encoding_slow.num_patches, encoding_fast.num_patches) + self.assertTrue(torch.allclose(encoding_slow.pixel_values, encoding_fast.pixel_values, atol=1e-1)) + self.assertLessEqual( + torch.mean(torch.abs(encoding_slow.pixel_values - encoding_fast.pixel_values)).item(), 1e-3 + ) + + def test_crop_to_patches(self): + # test slow image processor + image_processor = self.image_processor_list[0](**self.image_processor_dict) + image = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)[0] + processed_images = image_processor.crop_image_to_patches( + image, + min_patches=1, + max_patches=6, + use_thumbnail=True, + patch_size={"height": 20, "width": 20}, + ) + self.assertEqual(len(processed_images), 5) + self.assertEqual(processed_images[0].shape[:2], (20, 20)) + + # test fast image processor (process batch) + image_processor = self.image_processor_list[1](**self.image_processor_dict) + image = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)[0] + processed_images = image_processor.crop_image_to_patches( + image.unsqueeze(0), + min_patches=1, + max_patches=6, + use_thumbnail=True, + patch_size=SizeDict(height=20, width=20), + ) + self.assertEqual(len(processed_images[0]), 5) + self.assertEqual(processed_images.shape[-2:], (20, 20)) diff --git a/docs/transformers/tests/models/got_ocr2/test_modeling_got_ocr2.py b/docs/transformers/tests/models/got_ocr2/test_modeling_got_ocr2.py new file mode 100644 index 0000000000000000000000000000000000000000..ed0a25f7b19952fa547fb7054a4f73f5869ef68b --- /dev/null +++ b/docs/transformers/tests/models/got_ocr2/test_modeling_got_ocr2.py @@ -0,0 +1,347 @@ +# Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch GotOcr2 model.""" + +import unittest + +from transformers import ( + AutoProcessor, + GotOcr2Config, + is_torch_available, + is_vision_available, +) +from transformers.testing_utils import cleanup, require_torch, slow, torch_device + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + GotOcr2ForConditionalGeneration, + ) + + +if is_vision_available(): + from transformers.image_utils import load_image + + +class GotOcr2VisionText2TextModelTester: + def __init__( + self, + parent, + batch_size=3, + seq_length=7, + num_channels=3, + ignore_index=-100, + image_size=64, + bos_token_id=0, + eos_token_id=0, + pad_token_id=0, + image_token_index=1, + model_type="got_ocr2", + is_training=True, + text_config={ + "model_type": "qwen2", + "vocab_size": 99, + "hidden_size": 128, + "intermediate_size": 37, + "num_hidden_layers": 4, + "num_attention_heads": 4, + "num_key_value_heads": 2, + "output_channels": 64, + "hidden_act": "silu", + "max_position_embeddings": 512, + "rope_theta": 10000, + "mlp_ratio": 4, + "tie_word_embeddings": True, + }, + vision_config={ + "num_hidden_layers": 2, + "output_channels": 64, + "hidden_act": "quick_gelu", + "hidden_size": 32, + "mlp_dim": 128, + "num_attention_heads": 4, + "patch_size": 2, + "image_size": 64, + }, + ): + self.parent = parent + self.ignore_index = ignore_index + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.image_token_index = image_token_index + self.model_type = model_type + self.text_config = text_config + self.vision_config = vision_config + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.is_training = is_training + self.num_image_tokens = 64 + self.seq_length = seq_length + self.num_image_tokens + + self.num_hidden_layers = text_config["num_hidden_layers"] + self.vocab_size = text_config["vocab_size"] + self.hidden_size = text_config["hidden_size"] + self.num_attention_heads = text_config["num_attention_heads"] + + def get_config(self): + return GotOcr2Config( + text_config=self.text_config, + vision_config=self.vision_config, + model_type=self.model_type, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + image_token_index=self.image_token_index, + ) + + def prepare_config_and_inputs(self): + config = self.get_config() + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + attention_mask = torch.ones(input_ids.shape, dtype=torch.long, 
device=torch_device) + + # input_ids[:, -1] = self.pad_token_id + input_ids[input_ids == self.image_token_index] = self.pad_token_id + input_ids[:, : self.num_image_tokens] = self.image_token_index + + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class GotOcr2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (GotOcr2ForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = ( + { + "image-to-text": GotOcr2ForConditionalGeneration, + "image-text-to-text": GotOcr2ForConditionalGeneration, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + + def setUp(self): + self.model_tester = GotOcr2VisionText2TextModelTester(self) + self.config_tester = ConfigTester(self, config_class=GotOcr2Config, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + # while some other models require pixel_values to be present + def test_inputs_embeds_matches_input_ids(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + inputs_embeds = model.get_input_embeddings()(input_ids) + + with torch.no_grad(): + out_ids = model(input_ids=input_ids, **inputs)[0] + out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] + torch.testing.assert_close(out_embeds, out_ids) + + @unittest.skip( + reason="VLMs can't generate from inputs embeds and pixels. 
This can be tested as part of the backbone LM, no need to run the test for VLMs" + ) + def test_generate_from_inputs_embeds_with_static_cache(self): + pass + + @unittest.skip( + reason="GotOcr2's language backbone is Qwen2 which uses GQA so the KV cache is a non-standard format" + ) + def test_past_key_values_format(self): + pass + + +@require_torch +class GotOcr2IntegrationTest(unittest.TestCase): + def setUp(self): + self.processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf") + + def tearDown(self): + cleanup(torch_device, gc_collect=True) + + @slow + def test_small_model_integration_test_got_ocr_stop_strings(self): + model_id = "stepfun-ai/GOT-OCR-2.0-hf" + model = GotOcr2ForConditionalGeneration.from_pretrained(model_id, device_map=torch_device) + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/fixtures_ocr/resolve/main/iam_picture.jpeg" + ) + + inputs = self.processor(image, return_tensors="pt").to(torch_device) + generate_ids = model.generate( + **inputs, + do_sample=False, + num_beams=1, + tokenizer=self.processor.tokenizer, + stop_strings="<|im_end|>", + max_new_tokens=4096, + ) + decoded_output = self.processor.decode( + generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True + ) + expected_output = "industre" + self.assertEqual(decoded_output, expected_output) + + @slow + def test_small_model_integration_test_got_ocr_format(self): + model_id = "stepfun-ai/GOT-OCR-2.0-hf" + model = GotOcr2ForConditionalGeneration.from_pretrained(model_id, device_map=torch_device) + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg" + ) + + inputs = self.processor(image, return_tensors="pt", format=True).to(torch_device) + generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4) + decoded_output = self.processor.decode( + generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True + ) + expected_output = "\\title{\nR" + self.assertEqual(decoded_output, expected_output) + + @slow + def test_small_model_integration_test_got_ocr_fine_grained(self): + model_id = "stepfun-ai/GOT-OCR-2.0-hf" + model = GotOcr2ForConditionalGeneration.from_pretrained(model_id, device_map=torch_device) + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png" + ) + + inputs = self.processor(image, return_tensors="pt", color="green").to(torch_device) + generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4) + decoded_output = self.processor.decode( + generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True + ) + expected_output = "You should keep in" + self.assertEqual(decoded_output, expected_output) + + @slow + def test_small_model_integration_test_got_ocr_crop_to_patches(self): + model_id = "stepfun-ai/GOT-OCR-2.0-hf" + model = GotOcr2ForConditionalGeneration.from_pretrained(model_id, device_map=torch_device) + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/one_column.png" + ) + + inputs = self.processor(image, return_tensors="pt", crop_to_patches=True).to(torch_device) + generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4) + decoded_output = self.processor.decode( + generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True + ) + expected_output = "on developing architectural improvements" + self.assertEqual(decoded_output, 
expected_output) + + @slow + def test_small_model_integration_test_got_ocr_multi_pages(self): + model_id = "stepfun-ai/GOT-OCR-2.0-hf" + model = GotOcr2ForConditionalGeneration.from_pretrained(model_id, device_map=torch_device) + image1 = load_image( + "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/one_column.png" + ) + image2 = load_image( + "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png" + ) + + inputs = self.processor([image1, image2], return_tensors="pt", multi_page=True).to(torch_device) + generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4) + decoded_output = self.processor.decode( + generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True + ) + expected_output = "on developing architectural improvements" + self.assertEqual(decoded_output, expected_output) + + @slow + def test_small_model_integration_test_got_ocr_batched(self): + model_id = "stepfun-ai/GOT-OCR-2.0-hf" + model = GotOcr2ForConditionalGeneration.from_pretrained(model_id, device_map=torch_device) + image1 = load_image( + "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png" + ) + image2 = load_image( + "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg" + ) + + inputs = self.processor([image1, image2], return_tensors="pt").to(torch_device) + generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4) + decoded_output = self.processor.batch_decode( + generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True + ) + expected_output = ["Reducing the number", "R&D QUALITY"] + self.assertEqual(decoded_output, expected_output) diff --git a/docs/transformers/tests/models/got_ocr2/test_processor_got_ocr2.py b/docs/transformers/tests/models/got_ocr2/test_processor_got_ocr2.py new file mode 100644 index 0000000000000000000000000000000000000000..0719d211ddad3e33b6304b75bbfa244794bd9e6f --- /dev/null +++ b/docs/transformers/tests/models/got_ocr2/test_processor_got_ocr2.py @@ -0,0 +1,80 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import shutil +import tempfile +import unittest + +from transformers import AutoProcessor, GotOcr2Processor, PreTrainedTokenizerFast +from transformers.testing_utils import require_vision +from transformers.utils import is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import GotOcr2ImageProcessor + + +@require_vision +class GotOcr2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = GotOcr2Processor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + + image_processor = GotOcr2ImageProcessor() + tokenizer = PreTrainedTokenizerFast.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf") + processor_kwargs = {} + processor = GotOcr2Processor(image_processor, tokenizer, **processor_kwargs) + processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.img_pad_token + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname, ignore_errors=True) + + def test_ocr_queries(self): + processor = self.get_processor() + image_input = self.prepare_image_inputs() + inputs = processor(image_input, return_tensors="pt") + self.assertEqual(inputs["input_ids"].shape, (1, 286)) + self.assertEqual(inputs["pixel_values"].shape, (1, 3, 384, 384)) + + inputs = processor(image_input, return_tensors="pt", format=True) + self.assertEqual(inputs["input_ids"].shape, (1, 288)) + self.assertEqual(inputs["pixel_values"].shape, (1, 3, 384, 384)) + + inputs = processor(image_input, return_tensors="pt", color="red") + self.assertEqual(inputs["input_ids"].shape, (1, 290)) + self.assertEqual(inputs["pixel_values"].shape, (1, 3, 384, 384)) + + inputs = processor(image_input, return_tensors="pt", box=[0, 0, 100, 100]) + self.assertEqual(inputs["input_ids"].shape, (1, 303)) + self.assertEqual(inputs["pixel_values"].shape, (1, 3, 384, 384)) + + inputs = processor([image_input, image_input], return_tensors="pt", multi_page=True, format=True) + self.assertEqual(inputs["input_ids"].shape, (1, 547)) + self.assertEqual(inputs["pixel_values"].shape, (2, 3, 384, 384)) + + inputs = processor(image_input, return_tensors="pt", crop_to_patches=True, max_patches=6) + self.assertEqual(inputs["input_ids"].shape, (1, 1826)) + self.assertEqual(inputs["pixel_values"].shape, (7, 3, 384, 384)) diff --git a/docs/transformers/tests/models/gpt2/__init__.py b/docs/transformers/tests/models/gpt2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/gpt2/test_modeling_flax_gpt2.py b/docs/transformers/tests/models/gpt2/test_modeling_flax_gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..3297a3c45db79b847a1e4350cf11be636f331955 --- /dev/null +++ b/docs/transformers/tests/models/gpt2/test_modeling_flax_gpt2.py @@ -0,0 +1,254 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +import numpy as np + +from transformers import GPT2Config, GPT2Tokenizer, is_flax_available +from transformers.testing_utils import require_flax, slow + +from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_flax_available(): + import jax + import jax.numpy as jnp + + from transformers.models.gpt2.modeling_flax_gpt2 import FlaxGPT2LMHeadModel, FlaxGPT2Model + + +class FlaxGPT2ModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = None + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = GPT2Config( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + n_positions=self.max_position_embeddings, + use_cache=False, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + ) + + return (config, input_ids, input_mask) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + return config, inputs_dict + + def prepare_config_and_inputs_for_decoder(self): + config, input_ids, attention_mask = self.prepare_config_and_inputs() + + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + + def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask): + max_decoder_length = 20 + model = 
model_class_name(config) + + past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length) + attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4") + + position_ids = jnp.broadcast_to( + jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1) + ) + outputs_cache = model( + input_ids[:, :-1], + attention_mask=attention_mask, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4") + outputs_cache_next = model( + input_ids[:, -1:], + attention_mask=attention_mask, + past_key_values=outputs_cache.past_key_values, + position_ids=position_ids, + ) + + outputs = model(input_ids) + + diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])) + self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}") + + def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask): + max_decoder_length = 20 + model = model_class_name(config) + + attention_mask_cache = jnp.concatenate( + [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))], + axis=-1, + ) + + past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length) + position_ids = jnp.broadcast_to( + jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1) + ) + + outputs_cache = model( + input_ids[:, :-1], + attention_mask=attention_mask_cache, + past_key_values=past_key_values, + position_ids=position_ids, + ) + position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4") + outputs_cache_next = model( + input_ids[:, -1:], + past_key_values=outputs_cache.past_key_values, + attention_mask=attention_mask_cache, + position_ids=position_ids, + ) + + outputs = model(input_ids, attention_mask=attention_mask) + + diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])) + self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}") + + def check_bool_attention_mask_in_generation(self, model_class_name, config, input_ids, attention_mask): + model = model_class_name(config) + + output_int_att_mask = model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + max_new_tokens=3, + ) + + output_bool_att_mask = model.generate( + input_ids=input_ids, + attention_mask=attention_mask.astype(bool), + max_new_tokens=3, + ) + + self.parent.assertTrue( + (output_bool_att_mask.sequences == output_int_att_mask.sequences).all(), + "Generated response differ between boolean and integer attention mask", + ) + + +@require_flax +class FlaxGPT2ModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = (FlaxGPT2Model, FlaxGPT2LMHeadModel) if is_flax_available() else () + + def setUp(self): + self.model_tester = FlaxGPT2ModelTester(self) + + def test_use_cache_forward(self): + for model_class_name in self.all_model_classes: + config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask) + + def test_use_cache_forward_with_attn_mask(self): + for model_class_name in self.all_model_classes: + config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_use_cache_forward_with_attn_mask( + model_class_name, config, input_ids, attention_mask + ) + + def test_bool_attention_mask_in_generation(self): + for model_class_name 
in self.all_generative_model_classes: + config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_bool_attention_mask_in_generation( + model_class_name, config, input_ids, attention_mask + ) + + @slow + def test_batch_generation(self): + tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2", pad_token="", padding_side="left") + inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="np", padding=True, truncation=True) + + model = FlaxGPT2LMHeadModel.from_pretrained("openai-community/gpt2") + model.do_sample = False + model.config.pad_token_id = model.config.eos_token_id + + jit_generate = jax.jit(model.generate) + + output_sequences = jit_generate(inputs["input_ids"], attention_mask=inputs["attention_mask"]).sequences + + output_string = tokenizer.batch_decode(output_sequences, skip_special_tokens=True) + + expected_string = [ + "Hello this is a long string of words. I'm going to start with the first one.\n", + "Hey, I'm not sure if I'm going to be able to do", + ] + + self.assertListEqual(output_string, expected_string) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("openai-community/gpt2", from_pt=True) + outputs = model(np.ones((1, 1))) + self.assertIsNotNone(outputs) diff --git a/docs/transformers/tests/models/gpt2/test_modeling_gpt2.py b/docs/transformers/tests/models/gpt2/test_modeling_gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..44e342539b8602f0f82179a93b443053de7a25b8 --- /dev/null +++ b/docs/transformers/tests/models/gpt2/test_modeling_gpt2.py @@ -0,0 +1,897 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import math +import unittest + +import pytest + +from transformers import GPT2Config, is_torch_available +from transformers.testing_utils import ( + cleanup, + require_flash_attn, + require_torch, + require_torch_gpu, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + GPT2DoubleHeadsModel, + GPT2ForQuestionAnswering, + GPT2ForSequenceClassification, + GPT2ForTokenClassification, + GPT2LMHeadModel, + GPT2Model, + GPT2Tokenizer, + ) + + +class GPT2ModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = None + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + def get_large_model_config(self): + return GPT2Config.from_pretrained("openai-community/gpt2") + + def prepare_config_and_inputs( + self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config( + gradient_checkpointing=gradient_checkpointing, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + 
reorder_and_upcast_attn=reorder_and_upcast_attn, + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def get_config( + self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + return GPT2Config( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + n_inner=self.intermediate_size, + activation_function=self.hidden_act, + resid_pdrop=self.hidden_dropout_prob, + attn_pdrop=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + use_cache=True, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + gradient_checkpointing=gradient_checkpointing, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + reorder_and_upcast_attn=reorder_and_upcast_attn, + ) + + def get_pipeline_config(self): + config = self.get_config() + config.vocab_size = 300 + return config + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPT2Model(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(len(result.past_key_values), config.n_layer) + + def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPT2Model(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) + outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + output, past = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) + + # append to next input_ids and token_type_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) + + output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] + output_from_past = 
model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_gpt2_model_attention_mask_past( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = GPT2Model(config=config) + model.to(torch_device) + model.eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + half_seq_length = self.seq_length // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + output, past = model(input_ids, attention_mask=attn_mask).to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_gpt2_model_past_large_inputs( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = GPT2Model(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True) + + output, past = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and token_type_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask + )["last_hidden_state"] + output_from_past = model( + next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past + )["last_hidden_state"] + 
self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPT2LMHeadModel(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_forward_and_backwards( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False + ): + model = GPT2LMHeadModel(config) + model.to(torch_device) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + result.loss.backward() + + def create_and_check_double_lm_head_model( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args + ): + model = GPT2DoubleHeadsModel(config) + model.to(torch_device) + model.eval() + + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + + inputs = { + "input_ids": multiple_choice_inputs_ids, + "mc_token_ids": mc_token_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + "labels": multiple_choice_inputs_ids, + } + + result = model(**inputs) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size) + ) + self.parent.assertEqual(result.mc_logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_gpt2_for_question_answering( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = GPT2ForQuestionAnswering(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_gpt2_for_sequence_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = GPT2ForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def 
create_and_check_gpt2_for_token_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = GPT2ForTokenClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_gpt2_weight_initialization(self, config, *args): + model = GPT2Model(config) + model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer) + for key in model.state_dict().keys(): + if "c_proj" in key and "weight" in key: + self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001) + self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01) + + def create_and_check_cached_forward_with_and_without_attention_mask(self, config, input_ids, *args): + # Relevant issue: https://github.com/huggingface/transformers/issues/31943 + model = GPT2Model(config) + model.to(torch_device) + model.eval() + + # We want this for SDPA, eager works with a `None` attention mask + assert model.config._attn_implementation == "sdpa", ( + "This test assumes the model to have the SDPA implementation for its attention calculations." + ) + + # Prepare cache and non_cache input, needs a full attention mask + cached_len = input_ids.shape[-1] // 2 + input_mask = torch.ones(size=input_ids.size()).to(torch_device) + cache_inputs = {"input_ids": input_ids[:, :cached_len], "attention_mask": input_mask[:, :cached_len]} + non_cache_inputs = {"input_ids": input_ids[:, cached_len:], "attention_mask": input_mask} + + # Cached forward once with the attention mask provided and the other time without it (which should assume full attention) + cache_outputs = model(**cache_inputs) + full_outputs_with_attention_mask = model( + **non_cache_inputs, past_key_values=cache_outputs.past_key_values + ).last_hidden_state + full_outputs_without_attention_mask = model( + non_cache_inputs["input_ids"], past_key_values=cache_outputs.past_key_values + ).last_hidden_state + + self.parent.assertTrue( + torch.allclose(full_outputs_with_attention_mask, full_outputs_without_attention_mask, atol=1e-5) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "head_mask": head_mask, + } + + return config, inputs_dict + + +@require_torch +class GPT2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + GPT2Model, + GPT2LMHeadModel, + GPT2DoubleHeadsModel, + GPT2ForQuestionAnswering, + GPT2ForSequenceClassification, + GPT2ForTokenClassification, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": GPT2Model, + "question-answering": GPT2ForQuestionAnswering, + "text-classification": GPT2ForSequenceClassification, + "text-generation": GPT2LMHeadModel, + "token-classification": GPT2ForTokenClassification, + "zero-shot": GPT2ForSequenceClassification, + } + if is_torch_available() + else {} + ) + all_parallelizable_model_classes = (GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else () 
+ fx_compatible = False # Broken by attention refactor cc @Cyrilvallez + test_missing_keys = False + test_model_parallel = True + + # special case for DoubleHeads model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "GPT2DoubleHeadsModel": + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.num_choices, self.model_tester.seq_length), + dtype=torch.long, + device=torch_device, + ) + inputs_dict["input_ids"] = inputs_dict["labels"] + inputs_dict["token_type_ids"] = inputs_dict["labels"] + inputs_dict["mc_token_ids"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.num_choices), + dtype=torch.long, + device=torch_device, + ) + inputs_dict["mc_labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = GPT2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37) + + def tearDown(self): + super().tearDown() + # clean-up as much as possible GPU memory occupied by PyTorch + cleanup(torch_device) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_gpt2_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model(*config_and_inputs) + + def test_gpt2_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model_past(*config_and_inputs) + + def test_gpt2_model_att_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model_attention_mask_past(*config_and_inputs) + + def test_gpt2_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model_past_large_inputs(*config_and_inputs) + + def test_gpt2_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_gpt2_double_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs) + + def test_gpt2_question_answering_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_for_question_answering(*config_and_inputs) + + def test_gpt2_sequence_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_for_sequence_classification(*config_and_inputs) + + def test_gpt2_token_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_for_token_classification(*config_and_inputs) + + def test_gpt2_gradient_checkpointing(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) + + def test_gpt2_scale_attn_by_inverse_layer_idx(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs(scale_attn_by_inverse_layer_idx=True) + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs) + + 
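+    # `reorder_and_upcast_attn`, like `scale_attn_by_inverse_layer_idx` above, is a numerical-stability option of
+    # GPT2Config (the attention dot-product is computed with reordered scaling and upcast to float32), so the next
+    # test only needs to confirm that a forward and backward pass still succeeds with the flag enabled.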
def test_gpt2_reorder_and_upcast_attn(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs(reorder_and_upcast_attn=True) + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs) + + def test_gpt2_weight_initialization(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_weight_initialization(*config_and_inputs) + + def test_cached_forward_with_and_without_attention_mask(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_cached_forward_with_and_without_attention_mask(*config_and_inputs) + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_batch_generation(self): + model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2") + model.to(torch_device) + tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") + + tokenizer.padding_side = "left" + + # Define PAD Token = EOS Token = 50256 + tokenizer.pad_token = tokenizer.eos_token + model.config.pad_token_id = model.config.eos_token_id + + # use different length sentences to test batching + sentences = [ + "Hello, my dog is a little", + "Today, I", + ] + + inputs = tokenizer(sentences, return_tensors="pt", padding=True) + input_ids = inputs["input_ids"].to(torch_device) + token_type_ids = torch.cat( + [ + input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0), + input_ids.new_full((input_ids.shape[0], 1), 500), + ], + dim=-1, + ) + + outputs = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + max_length=20, + ) + + outputs_tt = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + token_type_ids=token_type_ids, + max_length=20, + ) + + inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) + output_non_padded = model.generate(input_ids=inputs_non_padded, max_length=20) + + num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item() + inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) + output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) + + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) + batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) + + expected_output_sentence = [ + "Hello, my dog is a little bit of a mess. I'm not sure if he's going", + "Today, I'm going to be doing a lot of research on this. 
I", + ] + self.assertListEqual(expected_output_sentence, batch_out_sentence) + self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) + + @slow + def test_batch_generation_2heads(self): + model = GPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2") + model.to(torch_device) + tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") + + tokenizer.padding_side = "left" + + # This tokenizer has no pad token, so we have to set it in some way + # Define PAD Token = EOS Token = 50256 + tokenizer.pad_token = tokenizer.eos_token + model.config.pad_token_id = model.config.eos_token_id + + # use different length sentences to test batching + sentences = [ + "Hello, my dog is a little", + "Today, I", + ] + + inputs = tokenizer(sentences, return_tensors="pt", padding=True) + input_ids = inputs["input_ids"].to(torch_device) + token_type_ids = torch.cat( + [ + input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0), + input_ids.new_full((input_ids.shape[0], 1), 500), + ], + dim=-1, + ) + + outputs = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + max_length=20, + ) + + outputs_tt = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + token_type_ids=token_type_ids, + max_length=20, + ) + + inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) + output_non_padded = model.generate(input_ids=inputs_non_padded, max_length=20) + + num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item() + inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) + output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) + + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) + batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) + + expected_output_sentence = [ + "Hello, my dog is a little bit of a mess. I'm not sure if he's going", + "Today, I'm going to be doing a lot of research on this. 
I", + ] + self.assertListEqual(expected_output_sentence, batch_out_sentence) + self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) + + @slow + def test_model_from_pretrained(self): + model_name = "openai-community/gpt2" + model = GPT2Model.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class GPT2ModelLanguageGenerationTest(unittest.TestCase): + def tearDown(self): + super().tearDown() + # clean-up as much as possible GPU memory occupied by PyTorch + cleanup(torch_device, gc_collect=True) + + def _test_lm_generate_gpt2_helper( + self, + gradient_checkpointing=False, + reorder_and_upcast_attn=False, + scale_attn_by_inverse_layer_idx=False, + verify_outputs=True, + ): + model = GPT2LMHeadModel.from_pretrained( + "openai-community/gpt2", + reorder_and_upcast_attn=reorder_and_upcast_attn, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + ) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + else: + model.gradient_checkpointing_disable() + model.to(torch_device) + + # The dog + input_ids = torch.tensor([[464, 3290]], dtype=torch.long, device=torch_device) + + # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog + expected_output_ids = [464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290,] # fmt: skip + output_ids = model.generate(input_ids, do_sample=False, max_length=20) + if verify_outputs: + self.assertListEqual(output_ids[0].tolist(), expected_output_ids) + + @slow + def test_lm_generate_gpt2(self): + self._test_lm_generate_gpt2_helper() + + @slow + def test_lm_generate_gpt2_with_gradient_checkpointing(self): + self._test_lm_generate_gpt2_helper(gradient_checkpointing=True) + + @slow + def test_lm_generate_gpt2_with_reorder_and_upcast_attn(self): + self._test_lm_generate_gpt2_helper(reorder_and_upcast_attn=True) + + @slow + def test_lm_generate_gpt2_with_scale_attn_by_inverse_layer_idx(self): + self._test_lm_generate_gpt2_helper(scale_attn_by_inverse_layer_idx=True, verify_outputs=False) + + @slow + def test_gpt2_sample(self): + tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") + model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2") + model.to(torch_device) + + torch.manual_seed(0) + tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True) + input_ids = tokenized.input_ids.to(torch_device) + output_ids = model.generate(input_ids, do_sample=True, max_length=20) + output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) + + token_type_ids = tokenized.token_type_ids.to(torch_device) + output_seq = model.generate(input_ids=input_ids, do_sample=True, num_return_sequences=5, max_length=20) + output_seq_tt = model.generate( + input_ids=input_ids, token_type_ids=token_type_ids, do_sample=True, num_return_sequences=5, max_length=20 + ) + output_seq_strs = tokenizer.batch_decode(output_seq, skip_special_tokens=True) + output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, skip_special_tokens=True) + + EXPECTED_OUTPUT_STR = ( + "Today is a nice day and if you don't know anything about the state of play during your holiday" + ) + self.assertEqual(output_str, EXPECTED_OUTPUT_STR) + self.assertTrue( + all(output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))) + ) # token_type_ids should 
change output + + @slow + def test_contrastive_search_gpt2(self): + article = ( + "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " + "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based" + ) + + gpt2_tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2-large") + gpt2_model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-large").to(torch_device) + input_ids = gpt2_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) + + outputs = gpt2_model.generate(input_ids, penalty_alpha=0.6, top_k=4, max_length=256) + + generated_text = gpt2_tokenizer.batch_decode(outputs, skip_special_tokens=True) + + self.assertListEqual( + generated_text, + [ + "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " + "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based in London, " + "United Kingdom\n\nGoogle has a lot of data on its users and uses it to improve its products, such as " + "Google Now, which helps users find the information they're looking for on the web. But the company " + "is not the only one to collect data on its users. Facebook, for example, has its own facial " + "recognition technology, as well as a database of millions of photos that it uses to personalize its " + "News Feed.\n\nFacebook's use of data is a hot topic in the tech industry, with privacy advocates " + "concerned about the company's ability to keep users' information private. In a blog post last " + 'year, Facebook CEO Mark Zuckerberg said his company would "do our best to be transparent about our ' + 'data use and how we use it."\n\n"We have made it clear that we do not sell or share your data with ' + 'third parties," Zuckerberg wrote. "If you have questions or concerns, please reach out to us at ' + 'privacy@facebook.com."\n\nGoogle declined to comment on the privacy implications of its use of data, ' + "but said in a statement to The Associated Press that" + ], + ) + + @require_flash_attn + @require_torch_gpu + @pytest.mark.flash_attn_test + @slow + def test_flash_attn_2_generate_padding_left(self): + """ + Overwriting the common test as the test is flaky on tiny models + """ + model = GPT2LMHeadModel.from_pretrained("gpt2", torch_dtype=torch.float16).to(0) + + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + + texts = ["hi", "Hello this is a very long sentence"] + + tokenizer.padding_side = "left" + tokenizer.pad_token = tokenizer.eos_token + + inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) + + output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_native = tokenizer.batch_decode(output_native) + + model = GPT2LMHeadModel.from_pretrained( + "gpt2", device_map={"": 0}, attn_implementation="flash_attention_2", torch_dtype=torch.float16 + ) + + output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_fa_2 = tokenizer.batch_decode(output_fa_2) + + expected_output = [ + "<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>hi, who was born in the city of Kolkata, was a member of the Kolkata", + "Hello this is a very long sentence. I'm sorry. I'm sorry. I'm sorry. I'm sorry. 
I'm sorry", + ] + + self.assertListEqual(output_native, output_fa_2) + self.assertListEqual(output_native, expected_output) diff --git a/docs/transformers/tests/models/gpt2/test_modeling_tf_gpt2.py b/docs/transformers/tests/models/gpt2/test_modeling_tf_gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..76ecd6d15bc251cca051a37571ceefadf84e3bdf --- /dev/null +++ b/docs/transformers/tests/models/gpt2/test_modeling_tf_gpt2.py @@ -0,0 +1,732 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import unittest + +from transformers import GPT2Config, is_tf_available +from transformers.testing_utils import require_tf, require_tf2onnx, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin +from ...utils.test_modeling_tf_core import TFCoreModelTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import GPT2Tokenizer + from transformers.models.gpt2.modeling_tf_gpt2 import ( + TFGPT2DoubleHeadsModel, + TFGPT2ForSequenceClassification, + TFGPT2LMHeadModel, + TFGPT2Model, + ) + from transformers.tf_utils import shape_list + + +class TFGPT2ModelTester: + def __init__( + self, + parent, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_token_type_ids = True + self.use_input_mask = True + self.use_labels = True + self.use_mc_token_ids = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 2 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + self.bos_token_id = self.vocab_size - 1 + self.eos_token_id = self.vocab_size - 1 + self.pad_token_id = self.vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + 
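+        # Deliberately tiny configuration (2 layers, 4 heads, hidden size 32) so that the TF common tests run quickly.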
config = GPT2Config( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + # intermediate_size=self.intermediate_size, + # hidden_act=self.hidden_act, + # hidden_dropout_prob=self.hidden_dropout_prob, + # attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + # type_vocab_size=self.type_vocab_size, + # initializer_range=self.initializer_range + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + return_dict=True, + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = TFGPT2Model(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + + inputs = [input_ids, None, input_mask] # None is the input for 'past' + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = TFGPT2Model(config=config) + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) + outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) + + # append to next input_ids and token_type_ids + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1) + + output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] + output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1])) + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] + output_from_past_slice = output_from_past[:, 0, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, 
output_from_no_past_slice, rtol=1e-6) + + def create_and_check_gpt2_model_attention_mask_past( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = TFGPT2Model(config=config) + + # create attention mask + half_seq_length = self.seq_length // 2 + attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32) + attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32) + attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1) + + # first forward pass + output, past_key_values = model(input_ids, attention_mask=attn_mask).to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size) + vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change) + condition = tf.transpose( + tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size)) + ) + input_ids = tf.where(condition, random_other_next_tokens, input_ids) + + # append to next input_ids and attn_mask + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + attn_mask = tf.concat([attn_mask, tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)], axis=1) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1])) + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] + output_from_past_slice = output_from_past[:, 0, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-12) + + def create_and_check_gpt2_model_past_large_inputs( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = TFGPT2Model(config=config) + + input_ids = input_ids[:1, :] + input_mask = input_mask[:1, :] + token_type_ids = token_type_ids[:1, :] + self.batch_size = 1 + + # first forward pass + outputs = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + next_token_types = ids_tensor((self.batch_size, 3), self.type_vocab_size) + + # append to next input_ids and token_type_ids + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1) + next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1) + + output_from_no_past = model( + next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask + )["last_hidden_state"] + output_from_past = model( + next_tokens, + token_type_ids=next_token_types, + attention_mask=next_attention_mask, + past_key_values=past_key_values, + )["last_hidden_state"] + self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) + + # select random slice + 
random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1])) + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] + output_from_past_slice = output_from_past[:, :, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3) + + def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = TFGPT2LMHeadModel(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_gpt2_double_head( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args + ): + model = TFGPT2DoubleHeadsModel(config=config) + + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + + inputs = { + "input_ids": multiple_choice_inputs_ids, + "mc_token_ids": mc_token_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size) + ) + self.parent.assertEqual(result.mc_logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_gpt2_for_sequence_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + "labels": sequence_labels, + } + model = TFGPT2ForSequenceClassification(config) + + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "attention_mask": input_mask, + } + return config, inputs_dict + + +@require_tf +class TFGPT2ModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2ForSequenceClassification, TFGPT2DoubleHeadsModel) + if is_tf_available() + else () + ) + all_generative_model_classes = (TFGPT2LMHeadModel,) if is_tf_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": TFGPT2Model, + "text-classification": TFGPT2ForSequenceClassification, + "text-generation": TFGPT2LMHeadModel, + "zero-shot": TFGPT2ForSequenceClassification, + } + if is_tf_available() + else {} + ) + test_head_masking = False + test_onnx = True + onnx_min_opset = 10 + + def setUp(self): + self.model_tester = TFGPT2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_gpt2_model(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model(*config_and_inputs) + + def test_gpt2_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model_past(*config_and_inputs) + + def test_gpt2_model_att_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model_attention_mask_past(*config_and_inputs) + + def test_gpt2_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_model_past_large_inputs(*config_and_inputs) + + def test_gpt2_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_lm_head(*config_and_inputs) + + def test_gpt2_double_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs) + + def test_gpt2_sequence_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt2_for_sequence_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "openai-community/gpt2" + model = TFGPT2Model.from_pretrained(model_name) + self.assertIsNotNone(model) + + # overwrite from common since ONNX runtime optimization doesn't work with tf.gather() when the argument + # `batch_dims` > 0" + @require_tf2onnx + @slow + def test_onnx_runtime_optimize(self): + if not self.test_onnx: + return + + import onnxruntime + import tf2onnx + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + # Skip these 2 classes which uses `tf.gather` with `batch_dims=1` + if model_class in [TFGPT2ForSequenceClassification, TFGPT2DoubleHeadsModel]: + continue + + model = model_class(config) + model.build_in_name_scope() + + onnx_model_proto, _ = tf2onnx.convert.from_keras(model, opset=self.onnx_min_opset) + + onnxruntime.InferenceSession(onnx_model_proto.SerializeToString()) + + # TODO (Joao): fix me + @unittest.skip("Onnx compliance broke with TF 2.10") + def test_onnx_compliancy(self): + pass + + +@require_tf +class TFGPT2ModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_greedy_distilgpt2_batch_special(self): + model = TFGPT2LMHeadModel.from_pretrained("distilbert/distilgpt2") + tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2") + + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "left" + + sentences = ["Today is a beautiful day and", "Yesterday was"] + input_ids = tokenizer(sentences, return_tensors="tf", padding=True) + + generation_kwargs = { + "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids], + "no_repeat_ngram_size": 2, + "do_sample": False, + "repetition_penalty": 1.3, + } + + output_ids = model.generate(**input_ids, **generation_kwargs) + + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + expected_output_string = [ + "Today is a beautiful day and I am so happy to be able take part in this amazing event.", + "Yesterday was a very interesting time for the world to see how much of this is", + ] + self.assertListEqual(output_strings, expected_output_string) + + @slow + def test_lm_generate_sample_distilgpt2_batch_special(self): + model = 
TFGPT2LMHeadModel.from_pretrained("distilbert/distilgpt2") + tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2") + + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "left" + + sentences = ["Today is a beautiful day and", "Yesterday was"] + input_ids = tokenizer(sentences, return_tensors="tf", padding=True) + + generation_kwargs = { + "do_sample": True, + "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids], + "no_repeat_ngram_size": 2, + "repetition_penalty": 1.3, + "temperature": 1.5, + "top_k": 500, + "top_p": 0.9, + "seed": [42, 0], # seed set -> deterministic sampling sequence -> deterministic generation + } + + # forces the generation to happen on CPU, to avoid GPU-related quirks + with tf.device(":/CPU:0"): + output_ids = model.generate(**input_ids, **generation_kwargs) + + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + expected_output_string = [ + "Today is a beautiful day and we will make you feel very hot/terrific in all your", + "Yesterday was known by national television networks as Le Big Show or Wild Dog Jeopard", + ] + self.assertListEqual(output_strings, expected_output_string) + + @slow + def test_lm_generate_greedy_distilgpt2_beam_search_special(self): + model = TFGPT2LMHeadModel.from_pretrained("distilbert/distilgpt2") + tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2") + + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "left" + + sentences = ["Today is a beautiful day and", "Yesterday was"] + input_ids = tokenizer(sentences, return_tensors="tf", padding=True) + + generation_kwargs = { + "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids], + "no_repeat_ngram_size": 2, + "do_sample": False, + "num_beams": 2, + } + + output_ids = model.generate(**input_ids, **generation_kwargs) + + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + expected_output_string = [ + "Today is a beautiful day and a great day for all of us.\n\nI’m", + "Yesterday was the first time that a person has been arrested in the United States for", + ] + self.assertListEqual(output_strings, expected_output_string) + + @slow + def test_lm_generate_distilgpt2_left_padding(self): + """Tests that the generated text is the same, regardless of left padding""" + model = TFGPT2LMHeadModel.from_pretrained("distilbert/distilgpt2") + tokenizer = GPT2Tokenizer.from_pretrained("distilbert/distilgpt2") + + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "left" + + generation_kwargs = { + "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids], + "no_repeat_ngram_size": 2, + "do_sample": False, + "repetition_penalty": 1.3, + } + expected_output_string = ( + "Today is a beautiful day and I am so happy to be able take part in this amazing event." 
+ ) + + sentences = ["Today is a beautiful day and"] + input_ids = tokenizer(sentences, return_tensors="tf", padding=True) + # using default length + output_ids = model.generate(**input_ids, **generation_kwargs) + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + self.assertEqual(output_strings[0], expected_output_string) + + sentences = ["Today is a beautiful day and", "This is a very long input that we absolutely don't care about"] + input_ids = tokenizer(sentences, return_tensors="tf", padding=True) + # longer max length to capture the full length (remember: it is left padded) + output_ids = model.generate(**input_ids, **generation_kwargs, max_length=27) + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + self.assertEqual(output_strings[0], expected_output_string) + + @slow + def test_lm_generate_gpt2_greedy_xla(self): + model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2") + tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") + + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "left" + + sentences = ["The dog", "The flying machine"] + expected_output_strings = [ + "The dog was found in a field near the intersection of West and West Streets.\n\nThe", + "The flying machine is a small, lightweight, and lightweight aircraft that can be used for any type of", + ] + input_ids = tokenizer(sentences, return_tensors="tf", padding=True) + + output_ids = model.generate(**input_ids, do_sample=False) + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + self.assertListEqual(output_strings, expected_output_strings) + + xla_generate = tf.function(model.generate, jit_compile=True) + output_ids = xla_generate(**input_ids, do_sample=False) + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + self.assertListEqual(output_strings, expected_output_strings) + + @slow + def test_lm_generate_gpt2_sample_xla(self): + # NOTE: due to the small numerical differences that are natural when we compile to XLA, sampling the same + # output out of the same seed is far from guaranteed. We can, however, confirm that the results are sensible + # and that we can seed both versions. 
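+        # For that reason, the assertions below compare eager and XLA generation against two separate reference
+        # strings (`expected_output_string` vs. `expected_output_string_xla`) rather than expecting both backends
+        # to produce identical samples.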
+ + # forces the generation to happen on CPU, to avoid GPU-related quirks + with tf.device(":/CPU:0"): + model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2") + tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") + + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "left" + + sentence = ["The dog", "The flying machine"] + expected_output_string = [ + "The dog owner asked why did our vet decide there needed to be extra ventilation inside because most" + " puppies", + "The flying machine was made by an artist who found it difficult to control it as it did not use", + ] + expected_output_string_xla = [ + "The dog has been named in connection with the murder of a 20-year-old man in", + "The flying machine is a new and improved system to operate and operate a new system and system " + "system system", + ] + input_ids = tokenizer(sentence, return_tensors="tf", padding=True) + + output_ids = model.generate(**input_ids, do_sample=True, seed=[7, 0]) + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + self.assertListEqual(output_strings, expected_output_string) + + xla_generate = tf.function(model.generate, jit_compile=True) + output_ids = xla_generate(**input_ids, do_sample=True, seed=[7, 0]) + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + self.assertListEqual(output_strings, expected_output_string_xla) + + @slow + def test_lm_generate_gpt2_beam_search_xla(self): + model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2") + tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") + + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "left" + + sentences = ["The dog", "The flying machine"] + expected_output_strings = [ + "The dog was found in the backyard of a home in the 6500 block of South Main Street", + "The flying machine is a very powerful machine, but it's not a very powerful machine. It's", + ] + input_ids = tokenizer(sentences, return_tensors="tf", padding=True) + + output_ids = model.generate(**input_ids, do_sample=False, num_beams=2) + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + self.assertListEqual(output_strings, expected_output_strings) + + xla_generate = tf.function(model.generate, jit_compile=True) + output_ids = xla_generate(**input_ids, do_sample=False, num_beams=2) + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + self.assertListEqual(output_strings, expected_output_strings) + + @slow + def test_contrastive_search_gpt2(self): + article = ( + "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " + "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based" + ) + + gpt2_tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2-large") + gpt2_model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2-large") + input_ids = gpt2_tokenizer(article, return_tensors="tf") + + outputs = gpt2_model.generate(**input_ids, penalty_alpha=0.6, top_k=4, max_length=256) + + generated_text = gpt2_tokenizer.batch_decode(outputs, skip_special_tokens=True) + + self.assertListEqual( + generated_text, + [ + "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " + "laboratory founded in 2010. DeepMind was acquired by Google in 2014. 
The company is based in London, " + "United Kingdom\n\nGoogle has a lot of data on its users and uses it to improve its products, such as " + "Google Now, which helps users find the information they're looking for on the web. But the company " + "is not the only one to collect data on its users. Facebook, for example, has its own facial " + "recognition technology, as well as a database of millions of photos that it uses to personalize its " + "News Feed.\n\nFacebook's use of data is a hot topic in the tech industry, with privacy advocates " + "concerned about the company's ability to keep users' information private. In a blog post last " + 'year, Facebook CEO Mark Zuckerberg said his company would "do our best to be transparent about our ' + 'data use and how we use it."\n\n"We have made it clear that we do not sell or share your data with ' + 'third parties," Zuckerberg wrote. "If you have questions or concerns, please reach out to us at ' + 'privacy@facebook.com."\n\nGoogle declined to comment on the privacy implications of its use of data, ' + "but said in a statement to The Associated Press that" + ], + ) + + @slow + def test_contrastive_search_gpt2_xla(self): + article = ( + "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " + "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based" + ) + + gpt2_tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2-large") + gpt2_model = TFGPT2LMHeadModel.from_pretrained("openai-community/gpt2-large") + input_ids = gpt2_tokenizer(article, return_tensors="tf") + + xla_generate = tf.function(gpt2_model.generate, jit_compile=True) + outputs = xla_generate(**input_ids, penalty_alpha=0.6, top_k=4, max_length=256) + + generated_text = gpt2_tokenizer.batch_decode(outputs, skip_special_tokens=True) + + self.assertListEqual( + generated_text, + [ + "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " + "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based in London, " + "United Kingdom\n\nGoogle has a lot of data on its users and uses it to improve its products, such as " + "Google Now, which helps users find the information they're looking for on the web. But the company " + "is not the only one to collect data on its users. Facebook, for example, has its own facial " + "recognition technology, as well as a database of millions of photos that it uses to personalize its " + "News Feed.\n\nFacebook's use of data is a hot topic in the tech industry, with privacy advocates " + "concerned about the company's ability to keep users' information private. In a blog post last " + 'year, Facebook CEO Mark Zuckerberg said his company would "do our best to be transparent about our ' + 'data use and how we use it."\n\n"We have made it clear that we do not sell or share your data with ' + 'third parties," Zuckerberg wrote. 
"If you have questions or concerns, please reach out to us at ' + 'privacy@facebook.com."\n\nGoogle declined to comment on the privacy implications of its use of data, ' + "but said in a statement to The Associated Press that" + ], + ) diff --git a/docs/transformers/tests/models/gpt2/test_tokenization_gpt2.py b/docs/transformers/tests/models/gpt2/test_tokenization_gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..0e954a8c6ca3b25032700d0613501cd9f58b2454 --- /dev/null +++ b/docs/transformers/tests/models/gpt2/test_tokenization_gpt2.py @@ -0,0 +1,381 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import unittest +from functools import lru_cache + +from transformers import AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast +from transformers.models.gpt2.tokenization_gpt2 import VOCAB_FILES_NAMES +from transformers.testing_utils import require_jinja, require_tiktoken, require_tokenizers + +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible + + +@require_tokenizers +class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "openai-community/gpt2" + tokenizer_class = GPT2Tokenizer + rust_tokenizer_class = GPT2TokenizerFast + test_rust_tokenizer = True + from_pretrained_kwargs = {"add_prefix_space": True} + test_seq2seq = False + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "<unk>", + "<|endoftext|>", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + cls.special_tokens_map = {"unk_token": "<unk>"} + + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(cls.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return GPT2Tokenizer.from_pretrained(pretrained_name, **kwargs) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return GPT2TokenizerFast.from_pretrained(pretrained_name, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "lower newer" + output_text = "lower newer" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "lower newer" + bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] + tokens = tokenizer.tokenize(text, add_prefix_space=True) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) + + sequence = "lower newer" + + # Testing tokenization + tokens = tokenizer.tokenize(sequence, add_prefix_space=True) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + # Testing conversion to ids without special tokens + ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + # Testing conversion to ids with special tokens + rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) + ids = tokenizer.encode(sequence, add_prefix_space=True) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + # Testing the unknown token + input_tokens = tokens + [rust_tokenizer.unk_token] + input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] + self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + @unittest.skip + def test_pretokenized_inputs(self, *args, **kwargs): + # It's very difficult to mix/test pretokenization with byte-level + # And get both GPT2 and Roberta to work at the same time (mostly an issue of adding a space before the string) + pass + + def test_padding(self, max_length=15):
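+ # GPT-2 ships without a pad token, so each of the padding requests below is expected to raise a ValueError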
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + + # Simple input + s = "This is a simple input" + s2 = ["This is a simple input 1", "This is a simple input 2"] + p = ("This is a simple input", "This is a pair") + p2 = [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ] + + # Simple input tests + self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + s2, + max_length=max_length, + padding="max_length", + ) + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + p2, + max_length=max_length, + padding="max_length", + ) + + def test_padding_if_pad_token_set_slow(self): + tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname, pad_token="<pad>") + + # Simple input + s = "This is a simple input" + s2 = ["This is a simple input looooooooong", "This is a simple input"] + p = ("This is a simple input", "This is a pair") + p2 = [ + ("This is a simple input loooooong", "This is a simple input"), + ("This is a simple pair loooooong", "This is a simple pair"), + ] + + pad_token_id = tokenizer.pad_token_id + + out_s = tokenizer(s, padding="max_length", max_length=30, return_tensors="np") + out_s2 = tokenizer(s2, padding=True, truncate=True, return_tensors="np") + out_p = tokenizer(*p, padding="max_length", max_length=60, return_tensors="np") + out_p2 = tokenizer(p2, padding=True, truncate=True, return_tensors="np") + + # s + # test single string max_length padding + self.assertEqual(out_s["input_ids"].shape[-1], 30) + self.assertTrue(pad_token_id in out_s["input_ids"]) + self.assertTrue(0 in out_s["attention_mask"]) + + # s2 + # test automatic padding + self.assertEqual(out_s2["input_ids"].shape[-1], 33) + # long slice doesn't have padding + self.assertFalse(pad_token_id in out_s2["input_ids"][0]) + self.assertFalse(0 in out_s2["attention_mask"][0]) + # short slice does have padding + self.assertTrue(pad_token_id in out_s2["input_ids"][1]) + self.assertTrue(0 in out_s2["attention_mask"][1]) + + # p + # test single pair max_length padding + self.assertEqual(out_p["input_ids"].shape[-1], 60) + self.assertTrue(pad_token_id in out_p["input_ids"]) + self.assertTrue(0 in out_p["attention_mask"]) + + # p2 + # test automatic padding pair + self.assertEqual(out_p2["input_ids"].shape[-1], 52) + # long slice pair doesn't have padding + self.assertFalse(pad_token_id in out_p2["input_ids"][0]) + self.assertFalse(0 in out_p2["attention_mask"][0]) + # short slice pair does have padding + self.assertTrue(pad_token_id in out_p2["input_ids"][1]) + self.assertTrue(0 in out_p2["attention_mask"][1]) + + def test_add_bos_token_slow(self): + bos_token = "$$$" + tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname, bos_token=bos_token, add_bos_token=True) + + s = "This is a simple input" + s2 = ["This is a simple input 1", "This is a simple input 2"] + + bos_token_id =
tokenizer.bos_token_id + + out_s = tokenizer(s) + out_s2 = tokenizer(s2) + + self.assertEqual(out_s.input_ids[0], bos_token_id) + self.assertTrue(all(o[0] == bos_token_id for o in out_s2.input_ids)) + + decode_s = tokenizer.decode(out_s.input_ids) + decode_s2 = tokenizer.batch_decode(out_s2.input_ids) + + self.assertTrue(decode_s.startswith(bos_token)) + self.assertTrue(all(d.startswith(bos_token) for d in decode_s2)) + + @unittest.skip(reason="tokenizer has no padding token") + def test_padding_different_model_input_name(self): + pass + + def test_special_tokens_mask_input_pairs_and_bos_token(self): + # TODO: change to self.get_tokenizers() when the fast version is implemented + tokenizers = [self.get_tokenizer(do_lower_case=False, add_bos_token=True)] + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + sequence_0 = "Encode this." + sequence_1 = "This one too please." + encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, + sequence_1, + add_special_tokens=True, + return_special_tokens_mask=True, + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + + filtered_sequence = [ + (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) + ] + filtered_sequence = [x for x in filtered_sequence if x is not None] + self.assertEqual(encoded_sequence, filtered_sequence) + + @require_jinja + def test_tokenization_for_chat(self): + tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname) + tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}" + test_chats = [ + [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], + [ + {"role": "system", "content": "You are a helpful chatbot."}, + {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Nice to meet you."}, + ], + [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}], + ] + tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] + # fmt: off + expected_tokens = [[20, 1, 20, 10, 20, 4, 3, 10, 20, 10, 20, 3, 0, 20, 20, 20, 0, 10, 20, 20, 20, 6, 20, 1, 6, 20, 20, 20, 3, 0, 0, 1, 20, 20], + [20, 1, 20, 10, 20, 4, 3, 10, 20, 10, 20, 3, 0, 20, 20, 20, 0, 10, 20, 20, 20, 6, 20, 1, 6, 20, 20, 20, 3, 0, 0, 1, 20, 20, 20, 7, 20, 3, 10, 6, 1, 10, 20, 3, 3, 6, 10, 20, 1, 20, 20, 20], + [20, 7, 20, 3, 10, 6, 1, 10, 20, 3, 3, 6, 10, 20, 1, 20, 20, 20, 20, 3, 0, 0, 1, 20, 20]] + # fmt: on + for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): + self.assertListEqual(tokenized_chat, expected_tokens) + + @require_tiktoken + def test_tokenization_tiktoken(self): + from tiktoken import encoding_name_for_model + + from transformers.integrations.tiktoken import convert_tiktoken_to_fast + + encoding = encoding_name_for_model("gpt2") + convert_tiktoken_to_fast(encoding, self.tmpdirname) + + tiktoken_fast_tokenizer = GPT2TokenizerFast.from_pretrained(self.tmpdirname) + rust_tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2") + sequence = "lower newer" + self.assertEqual( + rust_tokenizer.decode(rust_tokenizer.encode(sequence)), + 
tiktoken_fast_tokenizer.decode(rust_tokenizer.encode(sequence)), + ) + + +@require_tokenizers +class OPTTokenizationTest(unittest.TestCase): + def test_serialize_deserialize_fast_opt(self): + # More context: + # https://huggingface.co/wjmcat/opt-350m-paddle/discussions/1 + # https://huggingface.slack.com/archives/C01N44FJDHT/p1653511495183519 + # https://github.com/huggingface/transformers/pull/17088#discussion_r871246439 + + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", from_slow=True) + text = "A photo of a cat" + + tokens_ids = tokenizer.encode( + text, + ) + self.assertEqual(tokens_ids, [2, 250, 1345, 9, 10, 4758]) + tokenizer.save_pretrained("test_opt") + + tokenizer = AutoTokenizer.from_pretrained("./test_opt") + tokens_ids = tokenizer.encode( + text, + ) + self.assertEqual(tokens_ids, [2, 250, 1345, 9, 10, 4758]) + + def test_fast_slow_equivalence(self): + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", use_slow=True) + text = "A photo of a cat" + + tokens_ids = tokenizer.encode( + text, + ) + # Same as above + self.assertEqual(tokens_ids, [2, 250, 1345, 9, 10, 4758]) + + @unittest.skip(reason="This test is failing because of a bug in the fast tokenizer") + def test_users_can_modify_bos(self): + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", from_slow=True) + + tokenizer.bos_token = "bos" + tokenizer.bos_token_id = tokenizer.get_vocab()["bos"] + + text = "A photo of a cat" + tokens_ids = tokenizer.encode( + text, + ) + # We changed the bos token + self.assertEqual(tokens_ids, [31957, 250, 1345, 9, 10, 4758]) + tokenizer.save_pretrained("./tok") + tokenizer = AutoTokenizer.from_pretrained("./tok") + self.assertTrue(tokenizer.is_fast) + tokens_ids = tokenizer.encode( + text, + ) + self.assertEqual(tokens_ids, [31957, 250, 1345, 9, 10, 4758]) diff --git a/docs/transformers/tests/models/gpt2/test_tokenization_gpt2_tf.py b/docs/transformers/tests/models/gpt2/test_tokenization_gpt2_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..06f16c36e31711552490602618d0234fbc24354e --- /dev/null +++ b/docs/transformers/tests/models/gpt2/test_tokenization_gpt2_tf.py @@ -0,0 +1,131 @@ +import unittest +from pathlib import Path +from tempfile import TemporaryDirectory + +from transformers import AutoConfig, TFGPT2LMHeadModel, is_keras_nlp_available, is_tf_available +from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer +from transformers.testing_utils import require_keras_nlp, require_tf, slow + + +if is_tf_available(): + import tensorflow as tf + + +if is_keras_nlp_available(): + from transformers.models.gpt2 import TFGPT2Tokenizer + + +TOKENIZER_CHECKPOINTS = ["openai-community/gpt2"] +TINY_MODEL_CHECKPOINT = "openai-community/gpt2" + +if is_tf_available(): + + class ModelToSave(tf.Module): + def __init__(self, tokenizer): + super().__init__() + self.tokenizer = tokenizer + config = AutoConfig.from_pretrained(TINY_MODEL_CHECKPOINT) + self.model = TFGPT2LMHeadModel.from_config(config) + + @tf.function(input_signature=(tf.TensorSpec((None,), tf.string, name="text"),)) + def serving(self, text): + tokenized = self.tokenizer(text) + input_ids_dense = tokenized["input_ids"].to_tensor() + + input_mask = tf.cast(input_ids_dense > 0, tf.int32) + # input_mask = tf.reshape(input_mask, [-1, MAX_SEQ_LEN]) + + outputs = self.model(input_ids=input_ids_dense, attention_mask=input_mask)["logits"] + + return outputs + + +@require_tf +@require_keras_nlp +class GPTTokenizationTest(unittest.TestCase): + # The TF tokenizers are 
usually going to be used as pretrained tokenizers from existing model checkpoints, + # so that's what we focus on here. + + def setUp(self): + super().setUp() + + self.tokenizers = [GPT2Tokenizer.from_pretrained(checkpoint) for checkpoint in (TOKENIZER_CHECKPOINTS)] + self.tf_tokenizers = [TFGPT2Tokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS] + assert len(self.tokenizers) == len(self.tf_tokenizers) + + self.test_sentences = [ + "This is a straightforward English test sentence.", + "This one has some weird characters\rto\nsee\r\nif those\u00e9break things.", + "Now we're going to add some Chinese: 一 二 三 一二三", + "And some much more rare Chinese: 齉 堃 齉堃", + "Je vais aussi écrire en français pour tester les accents", + "Classical Irish also has some unusual characters, so in they go: Gaelaċ, ꝼ", + ] + self.paired_sentences = list(zip(self.test_sentences, self.test_sentences[::-1])) + + def test_output_equivalence(self): + for tokenizer, tf_tokenizer in zip(self.tokenizers, self.tf_tokenizers): + for test_inputs in self.test_sentences: + python_outputs = tokenizer([test_inputs], return_tensors="tf") + tf_outputs = tf_tokenizer([test_inputs]) + + for key in python_outputs.keys(): + # convert them to numpy to avoid messing with ragged tensors + python_outputs_values = python_outputs[key].numpy() + tf_outputs_values = tf_outputs[key].numpy() + + self.assertTrue(tf.reduce_all(python_outputs_values.shape == tf_outputs_values.shape)) + self.assertTrue(tf.reduce_all(tf.cast(python_outputs_values, tf.int64) == tf_outputs_values)) + + @slow + def test_graph_mode(self): + for tf_tokenizer in self.tf_tokenizers: + compiled_tokenizer = tf.function(tf_tokenizer) + for test_inputs in self.test_sentences: + test_inputs = tf.constant(test_inputs) + compiled_outputs = compiled_tokenizer(test_inputs) + eager_outputs = tf_tokenizer(test_inputs) + + for key in eager_outputs.keys(): + self.assertTrue(tf.reduce_all(eager_outputs[key] == compiled_outputs[key])) + + @slow + def test_saved_model(self): + for tf_tokenizer in self.tf_tokenizers: + model = ModelToSave(tokenizer=tf_tokenizer) + test_inputs = tf.convert_to_tensor([self.test_sentences[0]]) + out = model.serving(test_inputs) # Build model with some sample inputs + with TemporaryDirectory() as tempdir: + save_path = Path(tempdir) / "saved.model" + tf.saved_model.save(model, save_path, signatures={"serving_default": model.serving}) + loaded_model = tf.saved_model.load(save_path) + loaded_output = loaded_model.signatures["serving_default"](test_inputs)["output_0"] + # We may see small differences because the loaded model is compiled, so we need an epsilon for the test + self.assertTrue(tf.reduce_all(out == loaded_output)) + + @slow + def test_from_config(self): + for tf_tokenizer in self.tf_tokenizers: + test_inputs = tf.convert_to_tensor([self.test_sentences[0]]) + out = tf_tokenizer(test_inputs) # Build model with some sample inputs + + config = tf_tokenizer.get_config() + model_from_config = TFGPT2Tokenizer.from_config(config) + from_config_output = model_from_config(test_inputs) + + for key in from_config_output.keys(): + self.assertTrue(tf.reduce_all(from_config_output[key] == out[key])) + + @slow + def test_padding(self): + for tf_tokenizer in self.tf_tokenizers: + # for the test to run + tf_tokenizer.pad_token_id = 123123 + + for max_length in [3, 5, 1024]: + test_inputs = tf.convert_to_tensor([self.test_sentences[0]]) + out = tf_tokenizer(test_inputs, max_length=max_length) + + out_length = out["input_ids"].numpy().shape[1] 
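+ # with the pad_token_id set above, the output is expected to be padded or truncated to exactly max_length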
+ + assert out_length == max_length diff --git a/docs/transformers/tests/models/gpt_bigcode/__init__.py b/docs/transformers/tests/models/gpt_bigcode/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/docs/transformers/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py new file mode 100644 index 0000000000000000000000000000000000000000..67cfb40ea17610762b69eca64bc46e6c9b491cc7 --- /dev/null +++ b/docs/transformers/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py @@ -0,0 +1,594 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +import unittest + +from parameterized import parameterized + +from transformers import GPTBigCodeConfig, is_torch_available +from transformers.testing_utils import cleanup, require_torch, slow, torch_device + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + GPT2TokenizerFast, + GPTBigCodeForCausalLM, + GPTBigCodeForSequenceClassification, + GPTBigCodeForTokenClassification, + GPTBigCodeModel, + ) + from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeAttention + + +class GPTBigCodeModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="relu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + multi_query=True, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = 
None + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 2 + self.pad_token_id = vocab_size - 3 + self.multi_query = multi_query + + def get_large_model_config(self): + return GPTBigCodeConfig.from_pretrained("bigcode/gpt_bigcode-santacoder") + + def prepare_config_and_inputs( + self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config( + gradient_checkpointing=gradient_checkpointing, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + reorder_and_upcast_attn=reorder_and_upcast_attn, + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def get_config( + self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + return GPTBigCodeConfig( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + n_inner=self.intermediate_size, + activation_function=self.hidden_act, + resid_pdrop=self.hidden_dropout_prob, + attn_pdrop=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + use_cache=True, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + gradient_checkpointing=gradient_checkpointing, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + reorder_and_upcast_attn=reorder_and_upcast_attn, + attention_softmax_in_fp32=False, + scale_attention_softmax_in_fp32=False, + multi_query=self.multi_query, + ) + + def get_pipeline_config(self): + config = self.get_config() + config.vocab_size = 300 + return config + + def create_and_check_gpt_bigcode_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTBigCodeModel(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(len(result.past_key_values), config.n_layer) + + def create_and_check_gpt_bigcode_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTBigCodeModel(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, 
token_type_ids=token_type_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) + outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + output, past = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) + + # append to next input_ids and token_type_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) + + output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] + output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_gpt_bigcode_model_attention_mask_past( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = GPTBigCodeModel(config=config) + model.to(torch_device) + model.eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + half_seq_length = self.seq_length // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + output, past = model(input_ids, attention_mask=attn_mask).to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_gpt_bigcode_model_past_large_inputs( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = GPTBigCodeModel(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, 
use_cache=True) + + output, past = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and token_type_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask + )["last_hidden_state"] + output_from_past = model( + next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past + )["last_hidden_state"] + self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTBigCodeForCausalLM(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_forward_and_backwards( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False + ): + model = GPTBigCodeForCausalLM(config) + model.to(torch_device) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + result.loss.backward() + + def create_and_check_gpt_bigcode_for_sequence_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = GPTBigCodeForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_gpt_bigcode_for_token_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = GPTBigCodeForTokenClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_gpt_bigcode_weight_initialization(self, config, *args): + model = GPTBigCodeModel(config) + model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer) + for key in 
model.state_dict().keys(): + if "c_proj" in key and "weight" in key: + self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001) + self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "head_mask": head_mask, + } + + return config, inputs_dict + + +@require_torch +class GPTBigCodeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + # TODO: Update the tests to use valid pretrained models. + all_model_classes = ( + ( + GPTBigCodeModel, + GPTBigCodeForCausalLM, + GPTBigCodeForSequenceClassification, + GPTBigCodeForTokenClassification, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": GPTBigCodeModel, + "text-classification": GPTBigCodeForSequenceClassification, + "text-generation": GPTBigCodeForCausalLM, + "token-classification": GPTBigCodeForTokenClassification, + "zero-shot": GPTBigCodeForSequenceClassification, + } + if is_torch_available() + else {} + ) + fx_compatible = False + test_missing_keys = False + test_pruning = False + test_torchscript = False + multi_query = True + + # special case for DoubleHeads model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + return inputs_dict + + def setUp(self): + self.model_tester = GPTBigCodeModelTester(self, multi_query=self.multi_query) + self.config_tester = ConfigTester(self, config_class=GPTBigCodeConfig, n_embd=37) + + def tearDown(self): + super().tearDown() + # clean-up as much as possible GPU memory occupied by PyTorch + cleanup(torch_device) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="MQA models does not support retain_grad") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="Contrastive search not supported due to non-standard caching mechanism") + def test_contrastive_generate(self): + pass + + @unittest.skip(reason="Contrastive search not supported due to non-standard caching mechanism") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip(reason="CPU offload seems to be broken for some reason - tiny models keep hitting corner cases") + def test_cpu_offload(self): + pass + + @unittest.skip(reason="Disk offload seems to be broken for some reason - tiny models keep hitting corner cases") + def test_disk_offload(self): + pass + + @unittest.skip(reason="BigCodeGPT has a non-standard KV cache format.") + def test_past_key_values_format(self): + pass + + @unittest.skip(reason="BigCodeGPT has a non-standard KV cache format and breaks this test.") + def test_generate_continue_from_inputs_embeds(self): + pass + + def test_gpt_bigcode_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_bigcode_model(*config_and_inputs) + + def test_gpt_bigcode_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_bigcode_model_past(*config_and_inputs) + + def 
test_gpt_bigcode_model_att_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_bigcode_model_attention_mask_past(*config_and_inputs) + + def test_gpt_bigcode_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_bigcode_model_past_large_inputs(*config_and_inputs) + + def test_gpt_bigcode_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_gpt_bigcode_sequence_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_bigcode_for_sequence_classification(*config_and_inputs) + + def test_gpt_bigcode_token_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_bigcode_for_token_classification(*config_and_inputs) + + def test_gpt_bigcode_gradient_checkpointing(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) + + def test_gpt_bigcode_scale_attn_by_inverse_layer_idx(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs(scale_attn_by_inverse_layer_idx=True) + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs) + + def test_gpt_bigcode_reorder_and_upcast_attn(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs(reorder_and_upcast_attn=True) + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs) + + def test_gpt_bigcode_weight_initialization(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_bigcode_weight_initialization(*config_and_inputs) + + +@require_torch +class GPTBigCodeMHAModelTest(GPTBigCodeModelTest): + # `parameterized_class` breaks with mixins, so we use inheritance instead + multi_query = False + + +@slow +@require_torch +class GPTBigCodeModelLanguageGenerationTest(unittest.TestCase): + def test_generate_simple(self): + model = GPTBigCodeForCausalLM.from_pretrained("bigcode/gpt_bigcode-santacoder").to(torch_device) + tokenizer = GPT2TokenizerFast.from_pretrained("bigcode/gpt_bigcode-santacoder") + + input_ids = tokenizer("def print_hello_world():", return_tensors="pt").input_ids.to(torch_device) + + output_sequence = model.generate(input_ids) + output_sentence = tokenizer.decode(output_sequence[0], skip_special_tokens=True) + + expected_output = """def print_hello_world():\n print("Hello World!")\n\n\ndef print_hello_""" + self.assertEqual(output_sentence, expected_output) + + def test_generate_batched(self): + tokenizer = GPT2TokenizerFast.from_pretrained("bigcode/gpt_bigcode-santacoder") + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "left" + + model = GPTBigCodeForCausalLM.from_pretrained("bigcode/gpt_bigcode-santacoder").to(torch_device) + + inputs = tokenizer(["def print_hello_world():", "def say_hello():"], return_tensors="pt", padding=True).to( + torch_device + ) + outputs = model.generate(**inputs) + outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + expected_output = [ + 'def print_hello_world():\n print("Hello World!")\n\n\ndef print_hello_', + 'def say_hello():\n print("Hello, World!")\n\n\nsay_hello()', + ] + 
self.assertListEqual(outputs, expected_output) + + +@require_torch +class GPTBigCodeMQATest(unittest.TestCase): + def get_attention(self, multi_query): + config = GPTBigCodeConfig.from_pretrained( + "bigcode/gpt_bigcode-santacoder", + multi_query=multi_query, + attn_pdrop=0, + resid_pdrop=0, + ) + return GPTBigCodeAttention(config) + + @parameterized.expand([(seed, is_train_mode) for seed in range(5) for is_train_mode in [True, False]]) + def test_mqa_reduces_to_mha(self, seed, is_train_mode=True): + torch.manual_seed(seed) + + # CREATE MQA AND MHA ATTENTIONS + attention_mqa = self.get_attention(True) + attention_mha = self.get_attention(False) + + # ENFORCE MATCHING WEIGHTS + num_heads = attention_mqa.num_heads + embed_dim = attention_mqa.embed_dim + head_dim = attention_mqa.head_dim + + with torch.no_grad(): + mqa_q_weight = attention_mqa.c_attn.weight[:embed_dim, :].view(num_heads, 1, head_dim, embed_dim) + mqa_kv_weight = attention_mqa.c_attn.weight[embed_dim:, :].view(1, 2, head_dim, embed_dim) + mha_c_weight = torch.cat( + [mqa_q_weight, mqa_kv_weight.expand(num_heads, 2, head_dim, embed_dim)], dim=1 + ).view(3 * num_heads * head_dim, embed_dim) + + mqa_q_bias = attention_mqa.c_attn.bias[:embed_dim].view(num_heads, 1, head_dim) + mqa_kv_bias = attention_mqa.c_attn.bias[embed_dim:].view(1, 2, head_dim) + mha_c_bias = torch.cat([mqa_q_bias, mqa_kv_bias.expand(num_heads, 2, head_dim)], dim=1).view( + 3 * num_heads * head_dim + ) + + attention_mha.c_attn.weight.copy_(mha_c_weight) + attention_mha.c_attn.bias.copy_(mha_c_bias) + attention_mha.c_proj.weight.copy_(attention_mqa.c_proj.weight) + attention_mha.c_proj.bias.copy_(attention_mqa.c_proj.bias) + + # PUT THE MODEL INTO THE CORRECT MODE + attention_mha.train(is_train_mode) + attention_mqa.train(is_train_mode) + + # RUN AN INPUT THROUGH THE MODELS + num_tokens = 5 + hidden_states = torch.randn(1, num_tokens, embed_dim) + attention_mha_result = attention_mha(hidden_states)[0] + attention_mqa_result = attention_mqa(hidden_states)[0] + + # CHECK THAT ALL OUTPUTS ARE THE SAME + torch.testing.assert_close(attention_mha_result, attention_mqa_result, rtol=1e-5, atol=1e-5) diff --git a/docs/transformers/tests/models/gpt_neo/__init__.py b/docs/transformers/tests/models/gpt_neo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py b/docs/transformers/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py new file mode 100644 index 0000000000000000000000000000000000000000..abaadc2247e8d32d9da942b8ce567b1ad494b37e --- /dev/null +++ b/docs/transformers/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py @@ -0,0 +1,223 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +import numpy as np + +from transformers import GPT2Tokenizer, GPTNeoConfig, is_flax_available +from transformers.testing_utils import require_flax, slow + +from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask + + +if is_flax_available(): + import jax + import jax.numpy as jnp + + from transformers.models.gpt_neo.modeling_flax_gpt_neo import FlaxGPTNeoForCausalLM, FlaxGPTNeoModel + + +class FlaxGPTNeoModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + attention_types=[[["global", "local"], 1]], + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + window_size=7, + initializer_range=0.02, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.attention_types = attention_types + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.window_size = window_size + self.initializer_range = initializer_range + self.scope = None + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = GPTNeoConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_layers=self.num_hidden_layers, + num_heads=self.num_attention_heads, + max_position_embeddings=self.max_position_embeddings, + use_cache=False, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + window_size=self.window_size, + attention_types=self.attention_types, + ) + + return (config, input_ids, input_mask) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + return config, inputs_dict + + def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask): + max_decoder_length = 20 + model = model_class_name(config) + + past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length) + attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4") + + position_ids = jnp.broadcast_to( + jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1) + ) + outputs_cache = model( + input_ids[:, :-1], + attention_mask=attention_mask, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4") + outputs_cache_next = model( + input_ids[:, -1:], 
+ attention_mask=attention_mask, + past_key_values=outputs_cache.past_key_values, + position_ids=position_ids, + ) + + outputs = model(input_ids) + + diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])) + self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}") + + def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask): + max_decoder_length = 20 + model = model_class_name(config) + + attention_mask_cache = jnp.concatenate( + [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))], + axis=-1, + ) + + past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length) + position_ids = jnp.broadcast_to( + jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1) + ) + + outputs_cache = model( + input_ids[:, :-1], + attention_mask=attention_mask_cache, + past_key_values=past_key_values, + position_ids=position_ids, + ) + position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4") + outputs_cache_next = model( + input_ids[:, -1:], + past_key_values=outputs_cache.past_key_values, + attention_mask=attention_mask_cache, + position_ids=position_ids, + ) + + outputs = model(input_ids, attention_mask=attention_mask) + + diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])) + self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}") + + +@require_flax +class FlaxGPTNeoModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = (FlaxGPTNeoModel, FlaxGPTNeoForCausalLM) if is_flax_available() else () + + def setUp(self): + self.model_tester = FlaxGPTNeoModelTester(self) + + def test_use_cache_forward(self): + for model_class_name in self.all_model_classes: + config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask) + + def test_use_cache_forward_with_attn_mask(self): + for model_class_name in self.all_model_classes: + config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_use_cache_forward_with_attn_mask( + model_class_name, config, input_ids, attention_mask + ) + + @slow + def test_batch_generation(self): + tokenizer = GPT2Tokenizer.from_pretrained( + "openai-community/gpt2", pad_token="<|endoftext|>", padding_side="left" + ) + inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="np", padding=True, truncation=True) + + model = FlaxGPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M") + model.do_sample = False + model.config.pad_token_id = model.config.eos_token_id + + jit_generate = jax.jit(model.generate) + + output_sequences = jit_generate( + inputs["input_ids"], attention_mask=inputs["attention_mask"], pad_token_id=tokenizer.pad_token_id + ).sequences + + output_string = tokenizer.batch_decode(output_sequences, skip_special_tokens=True) + + expected_string = [ + "Hello this is a long string of text.\n\nI'm trying to get the text of the", + "Hey, I'm a little late to the party. 
I'm going to", + ] + + self.assertListEqual(output_string, expected_string) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("EleutherAI/gpt-neo-125M") + outputs = model(np.ones((1, 1))) + self.assertIsNotNone(outputs) diff --git a/docs/transformers/tests/models/gpt_neo/test_modeling_gpt_neo.py b/docs/transformers/tests/models/gpt_neo/test_modeling_gpt_neo.py new file mode 100644 index 0000000000000000000000000000000000000000..9384ab2f094628740edbf4e27228ba3d54188387 --- /dev/null +++ b/docs/transformers/tests/models/gpt_neo/test_modeling_gpt_neo.py @@ -0,0 +1,573 @@ +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch GPT Neo model.""" + +import unittest + +from transformers import GPTNeoConfig, is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device +from transformers.utils import cached_property + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + GPT2Tokenizer, + GPTNeoForCausalLM, + GPTNeoForQuestionAnswering, + GPTNeoForSequenceClassification, + GPTNeoForTokenClassification, + GPTNeoModel, + ) + + +class GPTNeoModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + attention_types=[[["global", "local"], 1]], + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + window_size=7, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.window_size = window_size + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + 
self.num_choices = num_choices + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + self.attention_types = attention_types + + def get_large_model_config(self): + return GPTNeoConfig.from_pretrained("gpt-neo-125M") + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def get_config(self): + return GPTNeoConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_layers=self.num_hidden_layers, + num_heads=self.num_attention_heads, + max_position_embeddings=self.max_position_embeddings, + use_cache=True, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + window_size=self.window_size, + attention_types=self.attention_types, + ) + + def get_pipeline_config(self): + config = self.get_config() + config.vocab_size = 300 + return config + + def create_and_check_gpt_neo_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTNeoModel(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + # past_key_values is not implemented + # self.parent.assertEqual(len(result.past_key_values), config.n_layer) + + def create_and_check_gpt_neo_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTNeoModel(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) + outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + output, past = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) + + # append to next input_ids and token_type_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) + + 
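+ # the full extended sequence without cache and only the new token with the cached past should yield matching final hidden states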
output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] + output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_gpt_neo_model_attention_mask_past( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = GPTNeoModel(config=config) + model.to(torch_device) + model.eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + half_seq_length = self.seq_length // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + output, past = model(input_ids, attention_mask=attn_mask).to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_gpt_neo_model_past_large_inputs( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = GPTNeoModel(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True) + + output, past = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and token_type_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask + )["last_hidden_state"] + output_from_past = model( + next_tokens, token_type_ids=next_token_types, 
attention_mask=next_attention_mask, past_key_values=past + )["last_hidden_state"] + self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTNeoForCausalLM(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_gpt_neo_for_question_answering( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = GPTNeoForQuestionAnswering(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_gpt_neo_for_sequence_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = GPTNeoForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_gpt_neo_for_token_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = GPTNeoForTokenClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_forward_and_backwards( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False + ): + model = GPTNeoForCausalLM(config) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + model.to(torch_device) + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + result.loss.backward() + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "head_mask": head_mask, + } + + return config, inputs_dict + + +@require_torch +class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, 
PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + GPTNeoModel, + GPTNeoForCausalLM, + GPTNeoForQuestionAnswering, + GPTNeoForSequenceClassification, + GPTNeoForTokenClassification, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": GPTNeoModel, + "question-answering": GPTNeoForQuestionAnswering, + "text-classification": GPTNeoForSequenceClassification, + "text-generation": GPTNeoForCausalLM, + "token-classification": GPTNeoForTokenClassification, + "zero-shot": GPTNeoForSequenceClassification, + } + if is_torch_available() + else {} + ) + fx_compatible = True + test_missing_keys = False + test_pruning = False + test_model_parallel = False + + # special case for DoubleHeads model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + return inputs_dict + + def setUp(self): + self.model_tester = GPTNeoModelTester(self) + self.config_tester = ConfigTester(self, config_class=GPTNeoConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_gpt_neo_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_neo_model(*config_and_inputs) + + def test_gpt_neo_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_neo_model_past(*config_and_inputs) + + def test_gpt_neo_model_att_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_neo_model_attention_mask_past(*config_and_inputs) + + def test_gpt_neo_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_neo_model_past_large_inputs(*config_and_inputs) + + def test_gpt_neo_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_gpt_neo_question_answering_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_neo_for_question_answering(*config_and_inputs) + + def test_gpt_neo_sequence_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_neo_for_sequence_classification(*config_and_inputs) + + def test_gpt_neo_token_classification_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gpt_neo_for_token_classification(*config_and_inputs) + + def test_gpt_neo_gradient_checkpointing(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) + + def _get_hidden_states(self): + return torch.tensor( + [ + [ + [0.4983, -0.7584, -1.6944, 0.5440], + [2.6918, 0.4206, 0.4176, 0.2055], + [-0.0071, -0.0405, -1.4920, -0.3630], + [1.0492, 0.1599, -1.7648, 0.2419], + [-1.8348, 2.0514, -0.1946, 0.3203], + [0.7672, -1.1600, -1.7118, -0.9056], + [0.2986, 0.5372, 0.7729, -0.1927], + [0.0285, 0.2629, -1.1156, -1.1992], + ] + ], + dtype=torch.float32, + device=torch_device, + ) + + def test_local_attn_probs(self): + model = GPTNeoModel.from_pretrained("valhalla/gpt-neo-random-tiny").eval() + layer = 
model.h[1].attn.attention.to(torch_device) + hidden_states = self._get_hidden_states() + hidden_states = torch.cat([hidden_states, hidden_states - 0.5], dim=2) + + batch_size, seq_length, _ = hidden_states.shape + mask_tokens = 2 + attention_mask = torch.ones(batch_size, seq_length, device=torch_device, dtype=torch.long) + attention_mask[:, -mask_tokens:] = 0 # dont attend last mask_tokens + + attention_mask = attention_mask.view(batch_size, -1) + attention_mask = attention_mask[:, None, None, :] + attention_mask = (1.0 - attention_mask) * -10000.0 + + attn_probs = layer(hidden_states, attention_mask=attention_mask, output_attentions=True)[-1] + + # the last 2 tokens are masked, and should have 0 attn_probs + self.assertTrue(torch.all(attn_probs[:, :, -mask_tokens:, -mask_tokens:] == 0)) + + # in loacal attention each token can only attend to the previous window_size tokens (including itself) + # here window_size is 4, so a token at index 5 can only attend to indcies [2, 3, 4, 5] + # and the attn_probs should be 0 for token [0, 1] + self.assertTrue(torch.all(attn_probs[:, :, 5, 2:6] != 0)) + self.assertTrue(torch.all(attn_probs[:, :, 5, :2] == 0)) + + +@require_torch +class GPTNeoModelLanguageGenerationTest(unittest.TestCase): + @cached_property + def model(self): + return GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B").to(torch_device) + + @cached_property + def tokenizer(self): + return GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B") + + @slow + def test_lm_generate_gpt_neo(self): + for checkpointing in [True, False]: + model = self.model + if checkpointing: + model.gradient_checkpointing_enable() + else: + model.gradient_checkpointing_disable() + input_ids = torch.tensor([[464, 3290]], dtype=torch.long, device=torch_device) # The dog + # The dog-eared copy of the book, which is a collection of essays by the late author, + expected_output_ids = [464, 3290, 12, 3380, 4866, 286, 262, 1492, 11, 543, 318, 257, 4947, 286, 27126, 416, 262, 2739, 1772, 11] # fmt: skip + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].tolist(), expected_output_ids) + + @slow + def test_gpt_neo_sample(self): + model = self.model + tokenizer = self.tokenizer + + torch.manual_seed(0) + tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True) + input_ids = tokenized.input_ids.to(torch_device) + output_ids = model.generate(input_ids, do_sample=True) + output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) + + EXPECTED_OUTPUT_STR = "Today is a nice day and if you don’t get the memo here is what you can" + self.assertEqual(output_str, EXPECTED_OUTPUT_STR) + + @slow + def test_batch_generation(self): + model = self.model + tokenizer = self.tokenizer + + tokenizer.padding_side = "left" + + # Define PAD Token = EOS Token = 50256 + tokenizer.pad_token = tokenizer.eos_token + model.config.pad_token_id = model.config.eos_token_id + + # use different length sentences to test batching + sentences = [ + "Hello, my dog is a little", + "Today, I am", + ] + + inputs = tokenizer(sentences, return_tensors="pt", padding=True) + input_ids = inputs["input_ids"].to(torch_device) + + outputs = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + ) + + inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) + output_non_padded = model.generate(input_ids=inputs_non_padded) + + num_paddings = inputs_non_padded.shape[-1] - 
inputs["attention_mask"][-1].long().sum().item() + inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) + output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) + + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) + + expected_output_sentence = [ + "Hello, my dog is a little bit of a kitty. She is a very sweet and loving", + "Today, I am going to talk about the best way to get a job in the", + ] + self.assertListEqual(expected_output_sentence, batch_out_sentence) + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) + + @slow + def test_model_from_pretrained(self): + model_name = "EleutherAI/gpt-neo-1.3B" + model = GPTNeoModel.from_pretrained(model_name) + self.assertIsNotNone(model) diff --git a/docs/transformers/tests/models/gpt_neox/__init__.py b/docs/transformers/tests/models/gpt_neox/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/gpt_neox/test_modeling_gpt_neox.py b/docs/transformers/tests/models/gpt_neox/test_modeling_gpt_neox.py new file mode 100644 index 0000000000000000000000000000000000000000..33c79f2a7b12ad2a744b9f971b5ee1de477c0d69 --- /dev/null +++ b/docs/transformers/tests/models/gpt_neox/test_modeling_gpt_neox.py @@ -0,0 +1,483 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch GPTNeoX model.""" + +import unittest + +from parameterized import parameterized + +from transformers import AutoTokenizer, DynamicCache, GPTNeoXConfig, is_torch_available, set_seed +from transformers.testing_utils import require_torch, slow, torch_device + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + GPTNeoXForCausalLM, + GPTNeoXForQuestionAnswering, + GPTNeoXForSequenceClassification, + GPTNeoXForTokenClassification, + GPTNeoXModel, + ) + from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXRotaryEmbedding + + +class GPTNeoXModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=64, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.pad_token_id = vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_labels = None + if self.use_labels: + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + + config = self.get_config() + + return config, input_ids, input_mask, token_labels + + def get_config(self): + return GPTNeoXConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + ) + + def prepare_config_and_inputs_for_decoder(self): + config, input_ids, input_mask, token_labels = self.prepare_config_and_inputs() + + config.is_decoder = True + + return config, input_ids, input_mask, 
token_labels + + def create_and_check_model(self, config, input_ids, input_mask): + model = GPTNeoXModel(config=config) + model.to(torch_device) + model.eval() + _ = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_model_as_decoder(self, config, input_ids, input_mask): + config.add_cross_attention = True + model = GPTNeoXModel(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_causal_lm(self, config, input_ids, input_mask, token_labels): + model = GPTNeoXForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_question_answering(self, config, input_ids, input_mask, token_labels): + config.num_labels = self.num_labels + model = GPTNeoXForQuestionAnswering(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification(self, config, input_ids, input_mask, token_labels): + config.num_labels = self.num_labels + model = GPTNeoXForSequenceClassification(config) + model.to(torch_device) + model.eval() + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification(self, config, input_ids, input_mask, token_labels): + config.num_labels = self.num_labels + model = GPTNeoXForTokenClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_decoder_model_past_large_inputs(self, config, input_ids, input_mask): + config.is_decoder = True + model = GPTNeoXForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, attention_mask=input_mask, use_cache=True) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask, output_hidden_states=True) + output_from_no_past = output_from_no_past["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + 
output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_cached_forward_with_and_without_attention_mask(self, config, input_ids, *args): + # Relevant issue: https://github.com/huggingface/transformers/issues/31943 + model = GPTNeoXModel(config) + model.to(torch_device) + model.eval() + + # We want this for SDPA, eager works with a `None` attention mask + assert model.config._attn_implementation == "sdpa", ( + "This test assumes the model to have the SDPA implementation for its attention calculations." + ) + + # Prepare cache and non_cache input, needs a full attention mask + cached_len = input_ids.shape[-1] // 2 + input_mask = torch.ones(size=input_ids.size()).to(torch_device) + cache_inputs = {"input_ids": input_ids[:, :cached_len], "attention_mask": input_mask[:, :cached_len]} + non_cache_inputs = {"input_ids": input_ids[:, cached_len:], "attention_mask": input_mask} + + def copy_cache(cache: DynamicCache): + """Deep copy a DynamicCache to reuse the same one multiple times.""" + new_cache = cache + for i in range(len(cache)): + new_cache.key_cache[i] = cache.key_cache[i].clone() + new_cache.value_cache[i] = cache.value_cache[i].clone() + + # Cached forward once with the attention mask provided and the other time without it (which should assume full attention) + # We need to run both on a copy of the cache, otherwise it is modified in-place + cache_outputs = model(**cache_inputs) + cache = cache_outputs.past_key_values + full_outputs_with_attention_mask = model( + **non_cache_inputs, past_key_values=copy_cache(cache) + ).last_hidden_state + full_outputs_without_attention_mask = model( + non_cache_inputs["input_ids"], past_key_values=copy_cache(cache) + ).last_hidden_state + + self.parent.assertTrue( + torch.allclose(full_outputs_with_attention_mask, full_outputs_without_attention_mask, atol=1e-5) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask, token_labels = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class GPTNeoXModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + GPTNeoXModel, + GPTNeoXForCausalLM, + GPTNeoXForQuestionAnswering, + GPTNeoXForSequenceClassification, + GPTNeoXForTokenClassification, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": GPTNeoXModel, + "question-answering": GPTNeoXForQuestionAnswering, + "text-classification": GPTNeoXForSequenceClassification, + "text-generation": GPTNeoXForCausalLM, + "token-classification": GPTNeoXForTokenClassification, + "zero-shot": GPTNeoXForSequenceClassification, + } + if is_torch_available() + else {} + ) + test_pruning = False + test_missing_keys = False + test_model_parallel = False + test_head_masking = False + + def setUp(self): + self.model_tester = GPTNeoXModelTester(self) + self.config_tester = ConfigTester(self, config_class=GPTNeoXConfig, hidden_size=64, num_attention_heads=8) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): 
+ config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(config, input_ids, input_mask) + + def test_model_as_decoder(self): + config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(config, input_ids, input_mask) + + def test_model_as_decoder_with_default_input_mask(self): + config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder(config, input_ids, input_mask) + + def test_decoder_model_past_large_inputs(self): + config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(config, input_ids, input_mask) + + def test_model_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + def test_model_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_model_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_model_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_cached_forward_with_and_without_attention_mask(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_cached_forward_with_and_without_attention_mask(*config_and_inputs) + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + @parameterized.expand([("linear",), ("dynamic",)]) + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_model_rope_scaling_from_config with Llama->GPTNeoX + def test_model_rope_scaling_from_config(self, scaling_type): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + short_input = ids_tensor([1, 10], config.vocab_size) + long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + original_model = GPTNeoXModel(config) + original_model.to(torch_device) + original_model.eval() + original_short_output = original_model(short_input).last_hidden_state + original_long_output = original_model(long_input).last_hidden_state + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + config.rope_scaling = {"type": scaling_type, "factor": 10.0} + scaled_model = GPTNeoXModel(config) + scaled_model.to(torch_device) + scaled_model.eval() + scaled_short_output = scaled_model(short_input).last_hidden_state + scaled_long_output = scaled_model(long_input).last_hidden_state + + # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original + # maximum sequence length, so the outputs for the short input should match. 
+ if scaling_type == "dynamic": + torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) + else: + self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) + + # The output should be different for long inputs + self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) + + def test_model_rope_scaling(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + scaling_factor = 10 + short_input_length = 10 + long_input_length = int(config.max_position_embeddings * 1.5) + + # Inputs + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device + position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) + position_ids_short = position_ids_short.unsqueeze(0) + position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) + position_ids_long = position_ids_long.unsqueeze(0) + + # Sanity check original RoPE + original_rope = GPTNeoXRotaryEmbedding(config).to(torch_device) + original_cos_short, original_sin_short = original_rope(x, position_ids_short) + original_cos_long, original_sin_long = original_rope(x, position_ids_long) + torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) + + # Sanity check linear RoPE scaling + # New position "x" should match original position with index "x/scaling_factor" + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + linear_scaling_rope = GPTNeoXRotaryEmbedding(config).to(torch_device) + linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) + linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) + torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) + for new_position in range(0, long_input_length, scaling_factor): + original_position = int(new_position // scaling_factor) + torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) + torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) + + # Sanity check Dynamic NTK RoPE scaling + # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase + # with scaling_factor (or that `inv_freq` decreases) + config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} + ntk_scaling_rope = GPTNeoXRotaryEmbedding(config).to(torch_device) + ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) + ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) + torch.testing.assert_close(ntk_cos_short, original_cos_short) + torch.testing.assert_close(ntk_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_sin_long, original_sin_long) + self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) + + +@require_torch +class GPTNeoXLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_gptneox(self): + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-410m-deduped") + for checkpointing in [True, False]: + model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-410m-deduped") + + if checkpointing: + model.gradient_checkpointing_enable() + else: + model.gradient_checkpointing_disable() + model.to(torch_device) + + inputs = tokenizer("My favorite food is", return_tensors="pt").to(torch_device) + # The hub repo. is updated on 2023-04-04, resulting in poor outputs. + # See: https://github.com/huggingface/transformers/pull/24193 + expected_output = "My favorite food is a good old-fashioned, old-fashioned, old-fashioned.\n\nI'm not sure" + + output_ids = model.generate(**inputs, do_sample=False, max_new_tokens=20) + output_str = tokenizer.batch_decode(output_ids)[0] + + self.assertEqual(output_str, expected_output) + + @slow + def test_lm_generate_flex_attn_gptneox(self): + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-410m-deduped") + for checkpointing in [True, False]: + model = GPTNeoXForCausalLM.from_pretrained( + "EleutherAI/pythia-410m-deduped", attn_implementation="flex_attention" + ) + self.assertTrue(model.config._attn_implementation == "flex_attention") + + if checkpointing: + model.gradient_checkpointing_enable() + else: + model.gradient_checkpointing_disable() + model.to(torch_device) + + inputs = tokenizer("My favorite food is", return_tensors="pt").to(torch_device) + # The hub repo. is updated on 2023-04-04, resulting in poor outputs. 
+ # See: https://github.com/huggingface/transformers/pull/24193 + expected_output = "My favorite food is a good old-fashioned, old-fashioned, old-fashioned.\n\nI'm not sure" + + output_ids = model.generate(**inputs, do_sample=False, max_new_tokens=20) + output_str = tokenizer.batch_decode(output_ids)[0] + + self.assertEqual(output_str, expected_output) + + def pythia_integration_test(self): + model_name_or_path = "EleutherAI/pythia-70m" + model = GPTNeoXForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16).to(torch_device) + EXPECTED_LOGITS = torch.tensor([1069.0000, 228.7500, 1072.0000, 1072.0000, 1069.0000, 1068.0000, 1068.0000, 1071.0000, 1071.0000, 1071.0000, 1073.0000, 1070.0000, 1071.0000, 1075.0000, 1073.0000, 1075.0000, 1074.0000, 1069.0000, 1072.0000, 1071.0000, 1071.0000, 1071.0000, 1070.0000, 1069.0000, 1069.0000, 1069.0000, 1070.0000, 1075.0000, 1073.0000, 1074.0000]) # fmt: skip + input_ids = [29, 93, 303, 64, 5478, 49651, 10394, 187, 34, 12939, 875] + # alternative: tokenizer('<|im_start|>system\nA chat between') + input_ids = torch.as_tensor(input_ids)[None].to(torch_device) + outputs = model(input_ids)["logits"][:, -1][0, :30] + torch.testing.assert_close(EXPECTED_LOGITS, outputs, rtol=1e-5, atol=1e-5) diff --git a/docs/transformers/tests/models/gpt_neox_japanese/__init__.py b/docs/transformers/tests/models/gpt_neox_japanese/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py b/docs/transformers/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py new file mode 100644 index 0000000000000000000000000000000000000000..168a0f2eebfbe5cf4cf01c0192135c58aff3f961 --- /dev/null +++ b/docs/transformers/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py @@ -0,0 +1,267 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch GPTNeoXJapanese model.""" + +import unittest + +from transformers import GPTNeoXJapaneseConfig, is_torch_available +from transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese import GPTNeoXJapaneseTokenizer +from transformers.testing_utils import require_torch, slow, torch_device + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import GPTNeoXJapaneseForCausalLM, GPTNeoXJapaneseModel + + +class GPTNeoXJapaneseModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_multiple_size=4, + hidden_act="gelu", + hidden_dropout=0.0, + attention_dropout=0.1, + weight_tying=True, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + bos_token_id=1, + eos_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_multiple_size = intermediate_multiple_size + self.hidden_act = hidden_act + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.weight_tying = weight_tying + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.eos_token_id = eos_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_labels = None + if self.use_labels: + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + + config = self.get_config() + + return config, input_ids, input_mask, token_labels + + def get_config(self): + return GPTNeoXJapaneseConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_multiple_size=self.intermediate_multiple_size, + hidden_act=self.hidden_act, + hidden_dropout=self.hidden_dropout, + attention_dropout=self.attention_dropout, + weight_tying=self.weight_tying, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + ) + + def prepare_config_and_inputs_for_decoder(self): + config, input_ids, input_mask, token_labels = self.prepare_config_and_inputs() + + config.is_decoder = True + + return 
config, input_ids, input_mask, token_labels + + def create_and_check_model(self, config, input_ids, input_mask): + model = GPTNeoXJapaneseModel(config=config) + model.to(torch_device) + model.eval() + _ = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_model_as_decoder(self, config, input_ids, input_mask): + config.add_cross_attention = True + model = GPTNeoXJapaneseModel(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_causal_lm(self, config, input_ids, input_mask, token_labels): + model = GPTNeoXJapaneseForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs(self, config, input_ids, input_mask): + config.is_decoder = True + model = GPTNeoXJapaneseForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, attention_mask=input_mask, use_cache=True) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask, output_hidden_states=True) + output_from_no_past = output_from_no_past["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask, token_labels = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class GPTNeoXModelJapaneseTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (GPTNeoXJapaneseModel, GPTNeoXJapaneseForCausalLM) if is_torch_available() else () + pipeline_model_mapping = ( + {"feature-extraction": GPTNeoXJapaneseModel, "text-generation": GPTNeoXJapaneseForCausalLM} + if is_torch_available() + else {} + ) + test_pruning = False + test_missing_keys = False + test_model_parallel = False + test_head_masking = False + + def setUp(self): + self.model_tester = GPTNeoXJapaneseModelTester(self) + self.config_tester = 
ConfigTester(self, config_class=GPTNeoXJapaneseConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(config, input_ids, input_mask) + + def test_model_as_decoder(self): + config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(config, input_ids, input_mask) + + def test_model_as_decoder_with_default_input_mask(self): + config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder(config, input_ids, input_mask) + + def test_decoder_model_past_large_inputs(self): + config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(config, input_ids, input_mask) + + def test_model_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + + @slow + def test_generation(self): + model_id = "abeja/gpt-neox-japanese-2.7b" + + prompts = ["データサイエンティストとは、", "100年後に必要とされる会社は、", "フルリモートの環境で働くために必要なことは、", "国境の長いトンネルを抜けると", "美味しい日本食といえば、"] # fmt: skip + + EXPECTED_OUTPUTS = [ + "データサイエンティストとは、データを分析し、ビジネスに役立つ知見を導き出す専門家のことです。", + "100年後に必要とされる会社は、「人」が中心の会社です。", + "フルリモートの環境で働くために必要なことは、「自分の時間をコントロールする」ことです。", + "国境の長いトンネルを抜けると、そこは雪国だった。", + "美味しい日本食といえば、やっぱりお寿司ですよね。", + ] + + tokenizer = GPTNeoXJapaneseTokenizer.from_pretrained(model_id) + model = GPTNeoXJapaneseForCausalLM.from_pretrained(model_id) + + predicted_outputs = [] + for prompt in prompts: + input_ids = tokenizer(prompt, return_tensors="pt").input_ids + generated_ids = model.generate(input_ids, max_length=50) + generated_string = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + predicted_outputs += generated_string + self.assertListEqual(predicted_outputs, EXPECTED_OUTPUTS) + + @unittest.skip("GPTNeoXJapanese applies bias to attention scores") + def test_custom_4d_attention_mask(self): + pass diff --git a/docs/transformers/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py b/docs/transformers/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py new file mode 100644 index 0000000000000000000000000000000000000000..483708c589c1940611cab77450ba0e1580cf87e0 --- /dev/null +++ b/docs/transformers/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py @@ -0,0 +1,143 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import os +import unittest +from functools import lru_cache + +from transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese import ( + VOCAB_FILES_NAMES, + GPTNeoXJapaneseTokenizer, +) +from transformers.testing_utils import require_tokenizers, slow + +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible + + +@require_tokenizers +class GPTNeoXJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "abeja/gpt-neox-japanese-2.7b" + tokenizer_class = GPTNeoXJapaneseTokenizer + test_rust_tokenizer = False + from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False} + + @classmethod + def setUpClass(cls): + super().setUpClass() + + vocab_tokens = [ + "こん", + "こんに", + "にちは", + "ばんは", + "世界,㔺界", + "、", + "。", + "
", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "<|emoji1|>", + "", + "<|startoftext|>", + "<|endoftext|>", + ] + emoji_tokens = {"emoji": {"\ud83d\ude00": "<|emoji1|>"}, "emoji_inv": {"<|emoji1|>": "\ud83d\ude00"}} # 😀 + cls.special_tokens_map = {"unk_token": ""} + + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.emoji_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["emoji_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + with open(cls.emoji_file, "w") as emoji_writer: + emoji_writer.write(json.dumps(emoji_tokens)) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return GPTNeoXJapaneseTokenizer.from_pretrained(pretrained_name, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "こんにちは、世界。 \nこんばんは、㔺界。😀" + output_text = "こんにちは、世界。 \nこんばんは、世界。😀" + return input_text, output_text + + def get_clean_sequence(self, tokenizer): + input_text, output_text = self.get_input_output_texts(tokenizer) + ids = tokenizer.encode(output_text, add_special_tokens=False) + text = tokenizer.decode(ids, clean_up_tokenization_spaces=False) + return text, ids + + def test_pretokenized_inputs(self): + pass # TODO add if relevant + + def test_maximum_encoding_length_pair_input(self): + pass # TODO add if relevant + + def test_maximum_encoding_length_single_input(self): + pass # TODO add if relevant + + def test_full_tokenizer(self): + tokenizer = self.get_tokenizer() + + # Testing tokenization + input_text = "こんにちは、世界。 こんばんは、㔺界。" + expected_token = ["こん", "にちは", "、", "世界", "。", "", "こん", "ばんは", "、", "㔺界", "。"] + tokens = tokenizer.tokenize(input_text) + self.assertListEqual(tokens, expected_token) + + # Testing conversion to ids without special tokens + expected_ids = [0, 2, 5, 4, 6, 8, 0, 3, 5, 4, 6] + input_ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual(input_ids, expected_ids) + + # Testing conversion to ids with special tokens + input_tokens = tokens + [tokenizer.unk_token] + expected_ids = [0, 2, 5, 4, 6, 8, 0, 3, 5, 4, 6, 19] + input_ids = tokenizer.convert_tokens_to_ids(input_tokens) + self.assertListEqual(input_ids, expected_ids) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("abeja/gpt-neox-japanese-2.7b") + + ids_1 = tokenizer.encode("ありがとう。", add_special_tokens=False) + ids_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(ids_1) + encoded_pair = tokenizer.build_inputs_with_special_tokens(ids_1, ids_2) + + assert encoded_sentence == ids_1 + assert encoded_pair == ids_1 + ids_2 + + @unittest.skip + def test_conversion_reversible(self): + # Intentionally convert some words to accommodate character fluctuations unique to Japanese + pass + + @unittest.skip(reason="tokenizer has no padding token") + def test_padding_different_model_input_name(self): + pass diff --git a/docs/transformers/tests/models/gpt_sw3/__init__.py b/docs/transformers/tests/models/gpt_sw3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py b/docs/transformers/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py 
new file mode 100644 index 0000000000000000000000000000000000000000..4a1a3292c5bf569be7826576848bf3f4f98ee18c --- /dev/null +++ b/docs/transformers/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py @@ -0,0 +1,162 @@ +# Copyright 2022 Hugging Face inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import GPTSw3Tokenizer +from transformers.testing_utils import get_tests_dir, require_jinja, require_sentencepiece, require_tokenizers, slow + +from ...test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_with_bytefallback.model") + + +@require_sentencepiece +@require_tokenizers +class GPTSw3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "AI-Sweden-Models/gpt-sw3-126m" + tokenizer_class = GPTSw3Tokenizer + test_rust_tokenizer = False + test_sentencepiece = True + test_sentencepiece_ignore_case = False + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # We have a SentencePiece fixture for testing + tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB, eos_token="", bos_token="", pad_token="") + + tokenizer.save_pretrained(cls.tmpdirname) + + def get_input_output_texts(self, tokenizer): + input_text = "This is a test" + output_text = "This is a test" + return input_text, output_text + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "" + token_id = 1 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "") + self.assertEqual(vocab_keys[1], "") + self.assertEqual(vocab_keys[-1], "j") + self.assertEqual(len(vocab_keys), 2_000) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 2_000) + + def test_full_tokenizer(self): + tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [465, 287, 265, 631, 842]) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + # fmt: off + self.assertListEqual( + tokens, + ["▁I", "▁was", "▁bor", "n", "▁in", "▁", "<0x39>", "2", "0", "0", "0", ",", "▁and", "▁this", "▁is", "▁f", "al", "s", "<0xC3>", "<0xA9>", "."], + ) + # fmt: on + + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, + [262, 272, 1525, 286, 271, 268, 60, 916, 633, 633, 633, 259, 266, 301, 287, 384, 367, 263, 198, 172, 260], + ) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + # fmt: off + self.assertListEqual( + back_tokens, + ["▁I", "▁was", "▁bor", "n", "▁in", "▁", "<0x39>", "2", "0", "0", "0", ",", "▁and", "▁this", "▁is", "▁f", "al", "s", "<0xC3>", "<0xA9>", "."] + ) + # fmt: on + + def test_fast_encode_decode(self): + 
tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB) + texts = ["This is a test", "I was born in 92000, and this is falsé."] + expected_ids_list = [ + [465, 287, 265, 631, 842], + [262, 272, 1525, 286, 271, 268, 60, 916, 633, 633, 633, 259, 266, 301, 287, 384, 367, 263, 198, 172, 260], + ] + + # Test that encode_fast returns the same as tokenize + convert_tokens_to_ids + for text, expected_ids in zip(texts, expected_ids_list): + self.assertListEqual(tokenizer.encode_fast(text), expected_ids) + + # Test that decode_fast returns the input text + for text, token_ids in zip(texts, expected_ids_list): + self.assertEqual(tokenizer.decode_fast(token_ids), text) + + @slow + def test_tokenizer_integration(self): + sequences = [ + "<|python|>def fibonacci(n)\n if n < 0:\n print('Incorrect input')", + "Hey there, how are you doing this fine day?", + "This is a text with a trailing spaces followed by a dot .", + "Häj sväjs lillebrör! =)", + "Det är inget fel på Mr. Cool", + ] + + expected_encoding = {"input_ids": [[63423, 5, 6811, 14954, 282, 816, 3821, 63466, 63425, 63462, 18, 63978, 678, 301, 1320, 63423, 63455, 63458, 18, 63982, 4246, 3940, 1901, 47789, 5547, 18994], [19630, 1100, 63446, 1342, 633, 544, 4488, 593, 5102, 2416, 63495, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1652, 428, 268, 1936, 515, 268, 58593, 22413, 9106, 546, 268, 33213, 63979, 698, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [55130, 63450, 924, 63449, 2249, 4062, 1558, 318, 63504, 21498, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [509, 377, 2827, 2559, 332, 6575, 63443, 26801, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="AI-Sweden-Models/gpt-sw3-126m", + sequences=sequences, + ) + + @require_jinja + def test_tokenization_for_chat(self): + tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB) + tokenizer.chat_template = ( + "{{ eos_token }}{{ bos_token }}" + "{% for message in messages %}" + "{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}" + "{% else %}{{ 'Bot: ' + message['content']}}{% endif %}" + "{{ message['text'] }}{{ bos_token }}" + "{% endfor %}" + "Bot:" + ) + # This is in English, but it's just here to make sure the chat control tokens are being added properly + test_chats = [ + [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], + [ + {"role": "system", "content": "You are a helpful chatbot."}, + {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Nice to meet you."}, + ], + [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}], + ] + tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] + # fmt: off + expected_tokens = [ + [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419], + [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 575, 541, 
419], + [2000, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419] + ] + # fmt: on + for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): + self.assertListEqual(tokenized_chat, expected_tokens) diff --git a/docs/transformers/tests/models/gptj/__init__.py b/docs/transformers/tests/models/gptj/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/gptj/test_modeling_flax_gptj.py b/docs/transformers/tests/models/gptj/test_modeling_flax_gptj.py new file mode 100644 index 0000000000000000000000000000000000000000..f92c07ab6e9574b97c0bb4ef2ac64eb4ef6d58cc --- /dev/null +++ b/docs/transformers/tests/models/gptj/test_modeling_flax_gptj.py @@ -0,0 +1,220 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +import numpy as np + +from transformers import GPT2Tokenizer, GPTJConfig, is_flax_available +from transformers.testing_utils import require_flax, tooslow + +from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask + + +if is_flax_available(): + import jax + import jax.numpy as jnp + + from transformers.models.gptj.modeling_flax_gptj import FlaxGPTJForCausalLM, FlaxGPTJModel + + +class FlaxGPTJModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + rotary_dim=4, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.rotary_dim = rotary_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = None + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = GPTJConfig( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + 
n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + n_positions=self.max_position_embeddings, + use_cache=False, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + rotary_dim=self.rotary_dim, + ) + + return (config, input_ids, input_mask) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + return config, inputs_dict + + def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask): + max_decoder_length = 20 + model = model_class_name(config) + + past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length) + attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4") + + position_ids = jnp.broadcast_to( + jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1) + ) + outputs_cache = model( + input_ids[:, :-1], + attention_mask=attention_mask, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4") + outputs_cache_next = model( + input_ids[:, -1:], + attention_mask=attention_mask, + past_key_values=outputs_cache.past_key_values, + position_ids=position_ids, + ) + + outputs = model(input_ids) + + diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])) + self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}") + + def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask): + max_decoder_length = 20 + model = model_class_name(config) + + attention_mask_cache = jnp.concatenate( + [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))], + axis=-1, + ) + + past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length) + position_ids = jnp.broadcast_to( + jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1) + ) + + outputs_cache = model( + input_ids[:, :-1], + attention_mask=attention_mask_cache, + past_key_values=past_key_values, + position_ids=position_ids, + ) + position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4") + outputs_cache_next = model( + input_ids[:, -1:], + past_key_values=outputs_cache.past_key_values, + attention_mask=attention_mask_cache, + position_ids=position_ids, + ) + + outputs = model(input_ids, attention_mask=attention_mask) + + diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])) + self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}") + + +@require_flax +class FlaxGPTJModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = (FlaxGPTJModel, FlaxGPTJForCausalLM) if is_flax_available() else () + + def setUp(self): + self.model_tester = FlaxGPTJModelTester(self) + + def test_use_cache_forward(self): + for model_class_name in self.all_model_classes: + config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask) + + def test_use_cache_forward_with_attn_mask(self): + for model_class_name in self.all_model_classes: + config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_use_cache_forward_with_attn_mask( + 
model_class_name, config, input_ids, attention_mask + ) + + @tooslow + def test_batch_generation(self): + tokenizer = GPT2Tokenizer.from_pretrained( + "openai-community/gpt2", pad_token="<|endoftext|>", padding_side="left" + ) + inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="np", padding=True, truncation=True) + + model = FlaxGPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B") + model.do_sample = False + model.config.pad_token_id = model.config.eos_token_id + + jit_generate = jax.jit(model.generate) + + output_sequences = jit_generate( + inputs["input_ids"], attention_mask=inputs["attention_mask"], pad_token_id=tokenizer.pad_token_id + ).sequences + + output_string = tokenizer.batch_decode(output_sequences, skip_special_tokens=True) + + expected_string = [ + "Hello this is a long string of text.\n\nI'm trying to get the text of the", + "Hey, I'm a little late to the party. I'm going to", + ] + + self.assertListEqual(output_string, expected_string) + + @tooslow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("EleutherAI/gpt-j-6B") + outputs = model(np.ones((1, 1))) + self.assertIsNotNone(outputs) diff --git a/docs/transformers/tests/models/gptj/test_modeling_gptj.py b/docs/transformers/tests/models/gptj/test_modeling_gptj.py new file mode 100644 index 0000000000000000000000000000000000000000..9614c5de1acb3ba31a73c693f5a2ad15d4579a71 --- /dev/null +++ b/docs/transformers/tests/models/gptj/test_modeling_gptj.py @@ -0,0 +1,579 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +from transformers import GPTJConfig, is_torch_available +from transformers.testing_utils import ( + require_torch, + slow, + tooslow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + AutoTokenizer, + GPTJForCausalLM, + GPTJForQuestionAnswering, + GPTJForSequenceClassification, + GPTJModel, + ) + + +class GPTJModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + rotary_dim=4, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.rotary_dim = rotary_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = None + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + def get_large_model_config(self): + return GPTJConfig.from_pretrained("EleutherAI/gpt-j-6B") + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def get_config(self): + return GPTJConfig( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, 
+ n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + use_cache=True, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + rotary_dim=self.rotary_dim, + ) + + def get_pipeline_config(self): + config = self.get_config() + config.vocab_size = 300 + return config + + def create_and_check_gptj_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTJModel(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(len(result.past_key_values), config.n_layer) + + def create_and_check_gptj_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTJModel(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) + outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + output, past = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) + + # append to next input_ids and token_type_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) + + output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] + output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_gptj_model_attention_mask_past( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = GPTJModel(config=config) + model.to(torch_device) + model.eval() + + # create attention mask + attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + half_seq_length = self.seq_length // 2 + attn_mask[:, half_seq_length:] = 0 + + # first forward pass + output, past = model(input_ids, attention_mask=attn_mask).to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = 
ids_tensor((1,), half_seq_length).item() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) + input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens + + # append to next input_ids and attn_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + attn_mask = torch.cat( + [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)], + dim=1, + ) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_gptj_model_past_large_inputs( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = GPTJModel(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True) + + output, past = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and token_type_ids + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask + )["last_hidden_state"] + output_from_past = model( + next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past + )["last_hidden_state"] + self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = GPTJForCausalLM(config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_forward_and_backwards( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False + ): + model = GPTJForCausalLM(config) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + model.to(torch_device) + + result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) + 
self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + result.loss.backward() + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask} + + return config, inputs_dict + + +@require_torch +class GPTJModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + (GPTJModel, GPTJForCausalLM, GPTJForSequenceClassification, GPTJForQuestionAnswering) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": GPTJModel, + "question-answering": GPTJForQuestionAnswering, + "text-classification": GPTJForSequenceClassification, + "text-generation": GPTJForCausalLM, + "zero-shot": GPTJForSequenceClassification, + } + if is_torch_available() + else {} + ) + fx_compatible = True + test_pruning = False + test_missing_keys = False + test_model_parallel = False + test_head_masking = False + + def test_torch_fx(self): + super().test_torch_fx() + + def test_torch_fx_output_loss(self): + super().test_torch_fx_output_loss() + + # TODO: Fix the failed tests + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + if ( + pipeline_test_case_name == "QAPipelineTests" + and tokenizer_name is not None + and not tokenizer_name.endswith("Fast") + ): + # `QAPipelineTests` fails for a few models when the slower tokenizer are used. 
+ # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework) + # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer + return True + + return False + + # special case for DoubleHeads model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + return inputs_dict + + def setUp(self): + self.model_tester = GPTJModelTester(self) + self.config_tester = ConfigTester(self, config_class=GPTJConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_gptj_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gptj_model(*config_and_inputs) + + def test_gptj_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gptj_model_past(*config_and_inputs) + + def test_gptj_model_att_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gptj_model_attention_mask_past(*config_and_inputs) + + def test_gptj_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gptj_model_past_large_inputs(*config_and_inputs) + + def test_gptj_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_gptj_gradient_checkpointing(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) + + @tooslow + def test_batch_generation(self): + # Marked as @tooslow due to GPU OOM + model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16) + model.to(torch_device) + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", revision="float16") + + tokenizer.padding_side = "left" + + # Define PAD Token = EOS Token = 50256 + tokenizer.pad_token = tokenizer.eos_token + model.config.pad_token_id = model.config.eos_token_id + + # use different length sentences to test batching + sentences = [ + "Hello, my dog is a little", + "Today, I", + ] + + inputs = tokenizer(sentences, return_tensors="pt", padding=True) + input_ids = inputs["input_ids"].to(torch_device) + token_type_ids = torch.cat( + [ + input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0), + input_ids.new_full((input_ids.shape[0], 1), 500), + ], + dim=-1, + ) + + outputs = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + ) + + outputs_tt = model.generate( + input_ids=input_ids, + attention_mask=inputs["attention_mask"].to(torch_device), + token_type_ids=token_type_ids, + ) + + inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) + output_non_padded = model.generate(input_ids=inputs_non_padded) + + num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item() + inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) + output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) + + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) + batch_out_sentence_tt = 
tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) + + expected_output_sentence = [ + "Hello, my dog is a little over a year old and has been diagnosed with a heart murmur", + "Today, I’m going to talk about the most important thing in the", + ] + self.assertListEqual(expected_output_sentence, batch_out_sentence) + self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) + + @slow + def test_model_from_pretrained(self): + model_name = "EleutherAI/gpt-j-6B" + model = GPTJModel.from_pretrained(model_name, revision="float16", torch_dtype=torch.float16) + self.assertIsNotNone(model) + + +@require_torch +class GPTJModelLanguageGenerationTest(unittest.TestCase): + @tooslow + def test_lm_generate_gptj(self): + # Marked as @tooslow due to GPU OOM + for checkpointing in [True, False]: + model = GPTJForCausalLM.from_pretrained( + "EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16 + ) + if checkpointing: + model.gradient_checkpointing_enable() + else: + model.gradient_checkpointing_disable() + model.to(torch_device) + input_ids = torch.tensor([[464, 3290]], dtype=torch.long, device=torch_device) # The dog + # The dog is a man's best friend. It is a loyal companion, and it is a friend + expected_output_ids = [464, 3290, 318, 257, 582, 338, 1266, 1545, 13, 632, 318, 257, 9112, 15185, 11, 290, 340, 318, 257, 1545] # fmt: skip + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].tolist(), expected_output_ids) + + @tooslow + def test_gptj_sample(self): + # Marked as @tooslow due to GPU OOM (issue #13676) + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", revision="float16") + model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16) + model.to(torch_device) + + torch.manual_seed(0) + tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True) + input_ids = tokenized.input_ids.to(torch_device) + output_ids = model.generate(input_ids, do_sample=True) + output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) + + token_type_ids = tokenized.token_type_ids.to(torch_device) + output_seq = model.generate(input_ids=input_ids, do_sample=True, num_return_sequences=5) + output_seq_tt = model.generate( + input_ids=input_ids, token_type_ids=token_type_ids, do_sample=True, num_return_sequences=5 + ) + output_seq_strs = tokenizer.batch_decode(output_seq, skip_special_tokens=True) + output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, skip_special_tokens=True) + + if torch_device != "cpu": + # currently this expect value is only for `cuda` + EXPECTED_OUTPUT_STR = ( + "Today is a nice day and I've already been enjoying it. I walked to work with my wife" + ) + else: + EXPECTED_OUTPUT_STR = "Today is a nice day and one of those days that feels a bit more alive. I am ready" + + self.assertEqual(output_str, EXPECTED_OUTPUT_STR) + self.assertTrue( + all(output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))) + ) # token_type_ids should change output + + @tooslow + def test_contrastive_search_gptj(self): + article = ( + "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. 
and " + "research laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based" + ) + + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B") + model = GPTJForCausalLM.from_pretrained( + "EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16 + ).to(torch_device) + input_ids = tokenizer(article, return_tensors="pt").input_ids.to(torch_device) + + outputs = model.generate(input_ids, penalty_alpha=0.6, top_k=4, max_length=256) + generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + self.assertListEqual( + generated_text, + [ + "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " + "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based in London, " + "United Kingdom with offices in Mountain View, San Francisco, New York City, Paris, Tokyo, Seoul, " + "Beijing, Singapore, Tel Aviv, Dublin, Sydney, and Melbourne.[1]\n\nContents\n\nIn 2010, Google's " + "parent company, Alphabet, announced a $500 million investment in DeepMind, with the aim of creating " + "a company that would apply deep learning to problems in healthcare, energy, transportation, and " + "other areas.[2]\n\nOn April 23, 2014, Google announced that it had acquired DeepMind for $400 " + "million in cash and stock.[3] The acquisition was seen as a way for Google to enter the " + "fast-growing field of artificial intelligence (AI), which it had so far avoided due to concerns " + 'about ethical and social implications.[4] Google co-founder Sergey Brin said that he was "thrilled" ' + 'to have acquired DeepMind, and that it would "help us push the boundaries of AI even further."' + "[5]\n\nDeepMind's founders, Demis Hassabis and Mustafa Suleyman, were joined by a number of Google " + "employees" + ], + ) diff --git a/docs/transformers/tests/models/gptj/test_modeling_tf_gptj.py b/docs/transformers/tests/models/gptj/test_modeling_tf_gptj.py new file mode 100644 index 0000000000000000000000000000000000000000..2103dd9c267933921e8b0046908e60bdebf3a890 --- /dev/null +++ b/docs/transformers/tests/models/gptj/test_modeling_tf_gptj.py @@ -0,0 +1,468 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import unittest + +from transformers import AutoTokenizer, GPTJConfig, is_tf_available +from transformers.testing_utils import require_tf, slow, tooslow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin +from ...utils.test_modeling_tf_core import TFCoreModelTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers.models.gptj.modeling_tf_gptj import ( + TFGPTJForCausalLM, + TFGPTJForQuestionAnswering, + TFGPTJForSequenceClassification, + TFGPTJModel, + shape_list, + ) + + +class TFGPTJModelTester: + def __init__(self, parent): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_token_type_ids = True + self.use_input_mask = True + self.use_labels = True + self.use_mc_token_ids = True + self.vocab_size = 99 + self.hidden_size = 32 + self.rotary_dim = 4 + self.num_hidden_layers = 2 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + self.bos_token_id = self.vocab_size - 1 + self.eos_token_id = self.vocab_size - 1 + self.pad_token_id = self.vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = GPTJConfig( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + rotary_dim=self.rotary_dim, + return_dict=True, + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def create_and_check_gptj_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = TFGPTJModel(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + + inputs = 
[input_ids, None, input_mask] # None is the input for 'past' + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_gptj_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = TFGPTJModel(config=config) + + # first forward pass + outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) + outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) + outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) + + self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) + self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) + + # append to next input_ids and token_type_ids + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1) + + output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] + output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1])) + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] + output_from_past_slice = output_from_past[:, 0, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6) + + def create_and_check_gptj_model_attention_mask_past( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = TFGPTJModel(config=config) + + # create attention mask + half_seq_length = self.seq_length // 2 + attn_mask_begin = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32) + attn_mask_end = tf.zeros((self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32) + attn_mask = tf.concat([attn_mask_begin, attn_mask_end], axis=1) + + # first forward pass + output, past_key_values = model(input_ids, attention_mask=attn_mask).to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) + + # change a random masked slice from input_ids + random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1 + random_other_next_tokens = ids_tensor((self.batch_size, self.seq_length), config.vocab_size) + vector_condition = tf.range(self.seq_length) == (self.seq_length - random_seq_idx_to_change) + condition = tf.transpose( + tf.broadcast_to(tf.expand_dims(vector_condition, -1), (self.seq_length, self.batch_size)) + ) + input_ids = tf.where(condition, random_other_next_tokens, input_ids) + + # append to next input_ids and attn_mask + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + attn_mask = tf.concat([attn_mask, tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)], axis=1) + + # get two different outputs + output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] + output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[ + "last_hidden_state" + ] + + # 
select random slice + random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1])) + output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] + output_from_past_slice = output_from_past[:, 0, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-12) + + def create_and_check_gptj_model_past_large_inputs( + self, config, input_ids, input_mask, head_mask, token_type_ids, *args + ): + model = TFGPTJModel(config=config) + + input_ids = input_ids[:1, :] + input_mask = input_mask[:1, :] + token_type_ids = token_type_ids[:1, :] + self.batch_size = 1 + + # first forward pass + outputs = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + next_token_types = ids_tensor((self.batch_size, 3), self.type_vocab_size) + + # append to next input_ids and token_type_ids + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1) + next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1) + + output_from_no_past = model( + next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask + )["last_hidden_state"] + output_from_past = model( + next_tokens, + token_type_ids=next_token_types, + attention_mask=next_attention_mask, + past_key_values=past_key_values, + )["last_hidden_state"] + self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) + + # select random slice + random_slice_idx = int(ids_tensor((1,), shape_list(output_from_past)[-1])) + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] + output_from_past_slice = output_from_past[:, :, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3) + + def create_and_check_gptj_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = TFGPTJForCausalLM(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "attention_mask": input_mask, + } + return config, inputs_dict + + +@require_tf +class TFGPTJModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + (TFGPTJForCausalLM, TFGPTJForSequenceClassification, TFGPTJForQuestionAnswering, TFGPTJModel) + if is_tf_available() + else () + ) + + all_generative_model_classes = (TFGPTJForCausalLM,) if is_tf_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": TFGPTJModel, + "question-answering": TFGPTJForQuestionAnswering, + "text-classification": TFGPTJForSequenceClassification, + "text-generation": 
TFGPTJForCausalLM, + "zero-shot": TFGPTJForSequenceClassification, + } + if is_tf_available() + else {} + ) + test_onnx = False + test_pruning = False + test_missing_keys = False + test_head_masking = False + + # TODO: Fix the failed tests + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + if ( + pipeline_test_case_name == "QAPipelineTests" + and tokenizer_name is not None + and not tokenizer_name.endswith("Fast") + ): + # `QAPipelineTests` fails for a few models when the slower tokenizer are used. + # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework) + # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer + return True + + return False + + def setUp(self): + self.model_tester = TFGPTJModelTester(self) + self.config_tester = ConfigTester(self, config_class=GPTJConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_gptj_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gptj_model(*config_and_inputs) + + def test_gptj_model_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gptj_model_past(*config_and_inputs) + + def test_gptj_model_att_mask_past(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gptj_model_attention_mask_past(*config_and_inputs) + + def test_gptj_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gptj_model_past_large_inputs(*config_and_inputs) + + def test_gptj_lm_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_gptj_lm_head_model(*config_and_inputs) + + @slow + @unittest.skipIf( + not is_tf_available() or len(tf.config.list_physical_devices("GPU")) > 0, + "skip testing on GPU for now to avoid GPU OOM.", + ) + def test_model_from_pretrained(self): + model = TFGPTJModel.from_pretrained("EleutherAI/gpt-j-6B", from_pt=True) + self.assertIsNotNone(model) + + @unittest.skip(reason="Currently, model embeddings are going to undergo a major refactor.") + def test_resize_token_embeddings(self): + super().test_resize_token_embeddings() + + +@require_tf +@tooslow +# Marked as @tooslow due to GPU OOM -- but still useful to run locally. Requires ~39GB of RAM. +class TFGPTJModelLanguageGenerationTest(unittest.TestCase): + def test_lm_generate_gptj(self): + model = TFGPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", from_pt=True) + input_ids = tf.convert_to_tensor([[464, 3290]], dtype=tf.int32) # The dog + # The dog is a man's best friend. 
It is a loyal companion, and it is a friend + expected_output_ids = [464, 3290, 318, 257, 582, 338, 1266, 1545, 13, 632, 318, 257, 9112, 15185, 11, 290, 340, 318, 257, 1545] # fmt: skip + output_ids = model.generate(input_ids, do_sample=False) + self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) + + def test_gptj_sample(self): + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", revision="float16") + model = TFGPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", from_pt=True) + + tokenized = tokenizer("Today is a nice day and", return_tensors="tf") + # forces the generation to happen on CPU, to avoid GPU-related quirks + with tf.device(":/CPU:0"): + output_ids = model.generate(**tokenized, do_sample=True, seed=[42, 0]) + output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) + + EXPECTED_OUTPUT_STR = "Today is a nice day and I’m going to go for a walk. I’" + self.assertEqual(output_str, EXPECTED_OUTPUT_STR) + + def _get_beam_search_test_objects(self): + model = TFGPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", from_pt=True) + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", revision="float16") + + tokenizer.padding_side = "left" + + # Define PAD Token = EOS Token = 50256 + tokenizer.pad_token = tokenizer.eos_token + model.config.pad_token_id = model.config.eos_token_id + + # use different length sentences to test batching + sentences = [ + "Hello, my dog is a little", + "Today, I", + ] + expected_output_sentences = [ + "Hello, my dog is a little over a year old and has been diagnosed with hip dysplasia", + "Today, I’m going to be talking about a topic that’", + ] + return model, tokenizer, sentences, expected_output_sentences + + def test_batch_beam_search(self): + # Confirms that we get the expected results with left-padded beam search + model, tokenizer, sentences, expected_output_sentences = self._get_beam_search_test_objects() + + inputs = tokenizer(sentences, return_tensors="tf", padding=True) + outputs = model.generate(**inputs, do_sample=False, num_beams=2) + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) + self.assertListEqual(expected_output_sentences, batch_out_sentence) + + def test_batch_left_padding(self): + # Confirms that left-padding is working properly + model, tokenizer, sentences, expected_output_sentences = self._get_beam_search_test_objects() + + inputs = tokenizer(sentences, return_tensors="tf", padding=True) + inputs_non_padded = tokenizer(sentences[0], return_tensors="tf") + output_non_padded = model.generate(**inputs_non_padded, do_sample=False, num_beams=2) + num_paddings = ( + shape_list(inputs_non_padded["input_ids"])[-1] + - tf.reduce_sum(tf.cast(inputs["attention_mask"][-1], tf.int64)).numpy() + ) + inputs_padded = tokenizer(sentences[1], return_tensors="tf") + output_padded = model.generate( + **inputs_padded, do_sample=False, num_beams=2, max_length=model.config.max_length - num_paddings + ) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) + self.assertListEqual(expected_output_sentences, [non_padded_sentence, padded_sentence]) + + def test_xla_beam_search(self): + # Confirms that XLA is working properly + model, tokenizer, sentences, expected_output_sentences = self._get_beam_search_test_objects() + + inputs = tokenizer(sentences, return_tensors="tf", padding=True) + xla_generate = 
tf.function(model.generate, jit_compile=True) + outputs_xla = xla_generate(**inputs, do_sample=False, num_beams=2) + xla_sentence = tokenizer.batch_decode(outputs_xla, skip_special_tokens=True) + self.assertListEqual(expected_output_sentences, xla_sentence) diff --git a/docs/transformers/tests/models/granite/__init__.py b/docs/transformers/tests/models/granite/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/granite/test_modeling_granite.py b/docs/transformers/tests/models/granite/test_modeling_granite.py new file mode 100644 index 0000000000000000000000000000000000000000..be1b5841ff8a7a857b2d373f11a0b8f990e9d9d6 --- /dev/null +++ b/docs/transformers/tests/models/granite/test_modeling_granite.py @@ -0,0 +1,383 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Granite model.""" + +import unittest + +from parameterized import parameterized + +from transformers import GraniteConfig, is_torch_available, set_seed +from transformers.testing_utils import ( + Expectations, + require_read_token, + require_torch, + require_torch_accelerator, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + GraniteForCausalLM, + GraniteModel, + ) + from transformers.models.granite.modeling_granite import ( + GraniteRotaryEmbedding, + ) + + +class GraniteModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + pad_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + 
self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return GraniteConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = GraniteModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + GraniteModel, + GraniteForCausalLM, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": GraniteModel, + "text-generation": GraniteForCausalLM, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + fx_compatible = False + + # Need to use `0.8` instead of `0.9` for `test_cpu_offload` + # This is because we are hitting edge cases with the causal_mask buffer + model_split_percents = [0.5, 0.7, 0.8] + + def setUp(self): + self.model_tester = GraniteModelTester(self) + self.config_tester = ConfigTester(self, config_class=GraniteConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + 
config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + @parameterized.expand([("linear",), ("dynamic",)]) + def test_model_rope_scaling_from_config(self, scaling_type): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + short_input = ids_tensor([1, 10], config.vocab_size) + long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + original_model = GraniteModel(config) + original_model.to(torch_device) + original_model.eval() + original_short_output = original_model(short_input).last_hidden_state + original_long_output = original_model(long_input).last_hidden_state + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + config.rope_scaling = {"type": scaling_type, "factor": 10.0} + scaled_model = GraniteModel(config) + scaled_model.to(torch_device) + scaled_model.eval() + scaled_short_output = scaled_model(short_input).last_hidden_state + scaled_long_output = scaled_model(long_input).last_hidden_state + + # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original + # maximum sequence length, so the outputs for the short input should match. + if scaling_type == "dynamic": + torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) + else: + self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) + + # The output should be different for long inputs + self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) + + def test_model_rope_scaling(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + scaling_factor = 10 + short_input_length = 10 + long_input_length = int(config.max_position_embeddings * 1.5) + + # Inputs + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device + position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) + position_ids_short = position_ids_short.unsqueeze(0) + position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) + position_ids_long = position_ids_long.unsqueeze(0) + + # Sanity check original RoPE + original_rope = GraniteRotaryEmbedding(config=config).to(torch_device) + original_cos_short, original_sin_short = original_rope(x, position_ids_short) + original_cos_long, original_sin_long = original_rope(x, position_ids_long) + torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) + + # Sanity check linear RoPE scaling + # New position "x" should match original position with index "x/scaling_factor" + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + linear_scaling_rope = GraniteRotaryEmbedding(config=config).to(torch_device) + linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) + linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) + torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) + for new_position in range(0, long_input_length, scaling_factor): + original_position = int(new_position // scaling_factor) 
+ torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) + torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) + + # Sanity check Dynamic NTK RoPE scaling + # Scaling should only be observed after a long input is fed. We can observe that the frequencies increase + # with scaling_factor (or that `inv_freq` decreases) + config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} + ntk_scaling_rope = GraniteRotaryEmbedding(config=config).to(torch_device) + ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) + ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) + torch.testing.assert_close(ntk_cos_short, original_cos_short) + torch.testing.assert_close(ntk_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_sin_long, original_sin_long) + self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) + + # Sanity check Yarn RoPE scaling + # Scaling should be over the entire input + config.rope_scaling = {"type": "yarn", "factor": scaling_factor} + yarn_scaling_rope = GraniteRotaryEmbedding(config=config).to(torch_device) + yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) + yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) + torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_short, original_cos_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_long, original_sin_long) + + +@require_torch_accelerator +class GraniteIntegrationTest(unittest.TestCase): + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + @slow + @require_read_token + def test_model_3b_logits_bf16(self): + input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] + + model = GraniteForCausalLM.from_pretrained( + "ibm/PowerLM-3b", device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="eager" + ) + + with torch.no_grad(): + out = model(torch.tensor([input_ids]).to(torch_device)) + # Expected mean on dim = -1 + + # fmt: off + EXPECTED_MEANS = Expectations( + { + ("xpu", 3): torch.tensor([[-3.1406, -2.5469, -2.6250, -2.1250, -2.6250, -2.6562, -2.6875, -2.9688]]), + ("cuda", 7): torch.tensor([[-1.9798, -3.1626, -2.8062, -2.3777, -2.7091, -2.2338, -2.5924, -2.3974]]), + ("cuda", 8): torch.tensor([[-3.1406, -2.5469, -2.6250, -2.1250, -2.6250, -2.6562, -2.6875, -2.9688]]), + } + ) + EXPECTED_MEAN = EXPECTED_MEANS.get_expectation() + + torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.mean(-1).float(), rtol=1e-2, atol=1e-2) + + # 
slicing logits[0, 0, 0:15] + EXPECTED_SLICES = Expectations( + { + ("xpu", 3): torch.tensor([[2.2031, -5.0625, -5.0625, -5.0625, -5.0625, -0.9180, -5.0625, -5.0625, -5.0625, -5.0625, -5.5312, -2.1719, -1.7891, -0.4922, -2.5469]]), + ("cuda", 7): torch.tensor([[4.8750, -2.1875, -2.1875, -2.1875, -2.1875, -2.8438, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875, -2.1875]]), + ("cuda", 8): torch.tensor([[2.0938, -5.0312, -5.0312, -5.0312, -5.0312, -1.0469, -5.0312, -5.0312, -5.0312, -5.0312, -5.5625, -2.1875, -1.7891, -0.5820, -2.6250]]), + } + ) + EXPECTED_SLICE = EXPECTED_SLICES.get_expectation() + # fmt: on + self.assertTrue( + torch.allclose( + EXPECTED_SLICE.to(torch_device), + out.logits[0, 0, :15].float(), + atol=1e-3, + rtol=1e-3, + ) + ) + + @slow + @require_read_token + def test_model_3b_logits(self): + input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] + + model = GraniteForCausalLM.from_pretrained("ibm/PowerLM-3b", device_map="auto", torch_dtype=torch.float16) + + with torch.no_grad(): + out = model(torch.tensor([input_ids]).to(torch_device)) + + # fmt: off + # Expected mean on dim = -1 + EXPECTED_MEANS = Expectations( + { + ("xpu", 3): torch.tensor([[-3.2693, -2.5957, -2.6234, -2.1675, -2.6386, -2.6850, -2.7039, -2.9656]]), + ("cuda", 7): torch.tensor([[-2.0984, -3.1294, -2.8153, -2.3568, -2.7337, -2.2624, -2.6016, -2.4022]]), + ("cuda", 8): torch.tensor([[-3.2934, -2.6019, -2.6258, -2.1691, -2.6394, -2.6876, -2.7032, -2.9688]]), + } + ) + EXPECTED_MEAN = EXPECTED_MEANS.get_expectation() + + torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2) diff --git a/docs/transformers/tests/models/granite_speech/__init__.py b/docs/transformers/tests/models/granite_speech/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/granite_speech/test_modeling_granite_speech.py b/docs/transformers/tests/models/granite_speech/test_modeling_granite_speech.py new file mode 100644 index 0000000000000000000000000000000000000000..02b1c4600b9ebf7421cac36870d8c533fe463036 --- /dev/null +++ b/docs/transformers/tests/models/granite_speech/test_modeling_granite_speech.py @@ -0,0 +1,393 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the IBM Granite Speech model.""" + +import tempfile +import unittest + +import pytest + +from transformers import ( + AutoProcessor, + GraniteSpeechConfig, + GraniteSpeechForConditionalGeneration, +) +from transformers.testing_utils import ( + cleanup, + require_torch, + require_torch_sdpa, + slow, + torch_device, +) +from transformers.utils import ( + is_datasets_available, + is_torch_available, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, +) + + +if is_torch_available(): + import torch + +if is_datasets_available(): + from datasets import load_dataset + + +class GraniteSpeechForConditionalGenerationModelTester: + def __init__( + self, + parent, + seq_length=7, + encoder_config={ + "model_type": "granite_speech_encoder", + "context_size": 200, + "conv_expansion_factor": 2, + "conv_kernel_size": 15, + "dim_head": 32, + "dropout": 0.1, + "feedforward_mult": 4, + "hidden_dim": 32, + "input_dim": 160, + "num_heads": 4, + "num_layers": 2, + "output_dim": 42, + }, + text_config={ + "model_type": "granite", + "is_training": True, + "seq_length": 7, + "use_token_type_ids": False, + "use_labels": True, + "vocab_size": 99, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 580, + "type_vocab_size": 16, + "type_sequence_label_size": 2, + "initializer_range": 0.02, + "num_labels": 3, + "num_choices": 4, + "pad_token_id": 1, + }, + projector_config={ + "attention_probs_dropout_prob": 0.1, + "cross_attention_frequency": 1, + "encoder_hidden_size": 32, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 32, + "initializer_range": 0.02, + "intermediate_size": 256, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 2048, + "model_type": "blip_2_qformer", + "num_attention_heads": 4, + "num_hidden_layers": 2, + "position_embedding_type": "absolute", + "use_qformer_text_input": False, + "vocab_size": 30522, + }, + audio_token_index=0, + tie_word_embeddings=True, + initializer_range=0.02, + has_lora_adapter=True, + downsample_rate=5, + window_size=15, + is_training=True, + ): + self.parent = parent + self.encoder_config = encoder_config + self.text_config = text_config + self.projector_config = projector_config + self.audio_token_index = audio_token_index + self.tie_word_embeddings = tie_word_embeddings + self.initializer_range = initializer_range + self.has_lora_adapater = has_lora_adapter + self.downsample_rate = downsample_rate + self.window_size = window_size + self.is_training = is_training + + # Dims for audio features + self.sequence_dim = 844 + self.feature_dim = 160 + self.num_attention_heads = text_config["num_attention_heads"] + self.num_hidden_layers = text_config["num_hidden_layers"] + self.hidden_size = text_config["hidden_size"] + self.batch_size = 3 + self.pad_token_id = text_config["pad_token_id"] + self.seq_len = 7 + self.num_audio_tokens = 2 + self.seq_length = seq_length + self.num_audio_tokens + + def get_config(self): + return GraniteSpeechConfig( + encoder_config=self.encoder_config, + text_config=self.text_config, + projector_config=self.projector_config, + audio_token_index=self.audio_token_index, + tie_word_embeddings=self.tie_word_embeddings, + initializer_range=self.initializer_range, 
+ has_lora_adapter=self.has_lora_adapater, + ) + + def prepare_config_and_inputs(self): + input_features = floats_tensor( + [self.batch_size, self.sequence_dim, self.feature_dim], + ) + config = self.get_config() + return config, input_features + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_features = config_and_inputs + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 + attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) + input_ids[input_ids == config.audio_token_index] = self.pad_token_id + + input_ids[:, : self.num_audio_tokens] = config.audio_token_index + + inputs_dict = { + "input_features": input_features, + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + def create_and_check_granite_speech_model_fp16_forward(self, config, input_ids, input_features, attention_mask): + model = GraniteSpeechForConditionalGeneration(config=config) + model.to(torch_device) + model.half() + model.eval() + logits = model( + input_ids=input_ids, + attention_mask=attention_mask, + input_features=input_features, + return_dict=True, + )["logits"] + self.parent.assertFalse(torch.isnan(logits).any().item()) + + def create_and_check_granite_speech_model_fp16_autocast_forward( + self, + config, + input_ids, + input_features, + attention_mask, + ): + config.torch_dtype = torch.float16 + model = GraniteSpeechForConditionalGeneration(config=config) + model.to(torch_device) + model.eval() + with torch.autocast(device_type="cuda", dtype=torch.float16): + logits = model( + input_ids=input_ids, + attention_mask=attention_mask, + input_features=input_features.to(torch.bfloat16), + return_dict=True, + )["logits"] + self.parent.assertFalse(torch.isnan(logits).any().item()) + + +@require_torch +class GraniteSpeechForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + """ + Model tester for `GraniteSpeechForConditionalGeneration`. 
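+ It builds a tiny composite config (granite_speech_encoder + blip_2_qformer projector + granite LM) via `GraniteSpeechForConditionalGenerationModelTester` so the common model and generation tests run quickly.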
+ """ + + all_model_classes = (GraniteSpeechForConditionalGeneration,) if is_torch_available() else () + test_pruning = False + test_head_masking = False + _is_composite = True + + def setUp(self): + self.model_tester = GraniteSpeechForConditionalGenerationModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=GraniteSpeechConfig, + has_text_modality=False, + ) + + def test_inputs_embeds(self): + # overwrite inputs_embeds tests because we need to delete "input features" for the audio model + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["input_features"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + def test_initialization(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if name == "projector.query": + continue + elif param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + # overwrite because Granite Speech is audio+text model (not vision+text) + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + if not self._is_composite: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + for model_class in self.all_model_classes: + # NOTE - currently we only enable alternate attention implementations on + # the encapsulated LLM; in the future, this should be added for the conformer + # encoder as well. + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" + + # `None` as it is the requested one which will be assigned to each sub-config + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model.language_model.config._attn_implementation == text_attn) + + model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") + model_eager = model_eager.eval().to(torch_device) + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + +class GraniteSpeechForConditionalGenerationIntegrationTest(unittest.TestCase): + def setUp(self): + # TODO - use the actual model path on HF hub after release. 
+ self.model_path = "ibm-granite/granite-speech" + self.processor = AutoProcessor.from_pretrained(self.model_path) + self.prompt = self._get_prompt(self.processor.tokenizer) + + def tearDown(self): + cleanup(torch_device, gc_collect=True) + + def _get_prompt(self, tokenizer): + chat = [ + { + "role": "system", + "content": "Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant", + }, + { + "role": "user", + "content": "<|audio|>can you transcribe the speech into a written format?", + }, + ] + return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) + + def _load_datasamples(self, num_samples): + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + @slow + @pytest.mark.skip("Public models not yet available") + def test_small_model_integration_test_single(self): + model = GraniteSpeechForConditionalGeneration.from_pretrained(self.model_path).to(torch_device) + input_speech = self._load_datasamples(1) + + # Verify feature sizes; note that the feature mask refers to the size of + # features that are masked into the LLM, not the output of the processor, + # which is why we inspect the mask instead of the `num_features` tensor. + inputs = self.processor(self.prompt, input_speech, return_tensors="pt").to(torch_device) + + num_computed_features = self.processor.audio_processor._get_num_audio_features( + [speech_arr.shape[-1] for speech_arr in input_speech], + )[0] + num_actual_features = torch.sum(inputs["input_features_mask"]).item() + assert num_actual_features == num_computed_features + + # verify generation + output = model.generate(**inputs, max_new_tokens=32) + EXPECTED_DECODED_TEXT = "systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantmister quilter is the apostle of the middle classes and we are glad to welcome his gospel" # fmt: skip + + self.assertEqual( + self.processor.tokenizer.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @pytest.mark.skip("Public models not yet available") + def test_small_model_integration_test_batch(self): + model = GraniteSpeechForConditionalGeneration.from_pretrained(self.model_path) + input_speech = self._load_datasamples(2) + prompts = [self.prompt, self.prompt] + + # Verify feature sizes & padding + inputs = self.processor(prompts, input_speech, return_tensors="pt").to(model.device) + num_computed_features = self.processor.audio_processor._get_num_audio_features( + [speech_arr.shape[-1] for speech_arr in input_speech], + ) + num_actual_features = torch.sum(inputs["input_features_mask"], dim=-1) + for e_feats, a_feats in zip(num_computed_features, num_actual_features): + assert e_feats == a_feats.item() + + # verify generation + output = model.generate(**inputs, max_new_tokens=32) + + EXPECTED_DECODED_TEXT = [ + "systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. 
You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantmister quilter is the apostle of the middle classes and we are glad to welcome his gospel", + "systemKnowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant\nusercan you transcribe the speech into a written format?\nassistantnor is mister quilter's manner less interesting than his matter" + ] # fmt: skip + + self.assertEqual( + self.processor.tokenizer.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) diff --git a/docs/transformers/tests/models/granite_speech/test_processor_granite_speech.py b/docs/transformers/tests/models/granite_speech/test_processor_granite_speech.py new file mode 100644 index 0000000000000000000000000000000000000000..a566658f63dfb386cdb071f830b6194b5641c318 --- /dev/null +++ b/docs/transformers/tests/models/granite_speech/test_processor_granite_speech.py @@ -0,0 +1,222 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import shutil +import tempfile +import unittest + +import numpy as np +import pytest +import torch +from parameterized import parameterized + +from transformers import AutoTokenizer, GPT2TokenizerFast +from transformers.testing_utils import ( + require_torch, + require_torch_gpu, + require_torchaudio, +) +from transformers.utils import is_torchaudio_available + + +if is_torchaudio_available(): + from transformers import GraniteSpeechFeatureExtractor, GraniteSpeechProcessor + + +@pytest.skip("Public models not yet available", allow_module_level=True) +@require_torch +@require_torchaudio +class GraniteSpeechProcessorTest(unittest.TestCase): + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + # TODO - use the actual model path on HF hub after release. 
+ self.checkpoint = "ibm-granite/granite-speech" + processor = GraniteSpeechProcessor.from_pretrained(self.checkpoint) + processor.save_pretrained(self.tmpdirname) + + def get_tokenizer(self, **kwargs): + return AutoTokenizer.from_pretrained(self.checkpoint, **kwargs) + + def get_audio_processor(self, **kwargs): + return GraniteSpeechFeatureExtractor.from_pretrained(self.checkpoint, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def test_save_load_pretrained_default(self): + """Ensure we can save / reload a processor correctly.""" + tokenizer = self.get_tokenizer() + audio_processor = self.get_audio_processor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + audio_processor=audio_processor, + ) + + processor.save_pretrained(self.tmpdirname) + processor = GraniteSpeechProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) + self.assertIsInstance(processor.tokenizer, GPT2TokenizerFast) + + self.assertEqual(processor.audio_processor.to_json_string(), audio_processor.to_json_string()) + self.assertIsInstance(processor.audio_processor, GraniteSpeechFeatureExtractor) + + def test_requires_text(self): + """Ensure we require text""" + tokenizer = self.get_tokenizer() + audio_processor = self.get_audio_processor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + audio_processor=audio_processor, + ) + + with pytest.raises(TypeError): + processor(text=None) + + def test_bad_text_fails(self): + """Ensure we gracefully fail if text is the wrong type.""" + tokenizer = self.get_tokenizer() + audio_processor = self.get_audio_processor() + + processor = GraniteSpeechProcessor(tokenizer=tokenizer, audio_processor=audio_processor) + with pytest.raises(TypeError): + processor(text=424, audio=None) + + def test_bad_nested_text_fails(self): + """Ensure we gracefully fail if text is the wrong nested type.""" + tokenizer = self.get_tokenizer() + audio_processor = self.get_audio_processor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + audio_processor=audio_processor, + ) + + with pytest.raises(TypeError): + processor(text=[424], audio=None) + + def test_bad_audio_fails(self): + """Ensure we gracefully fail if audio is the wrong type.""" + tokenizer = self.get_tokenizer() + audio_processor = self.get_audio_processor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + audio_processor=audio_processor, + ) + + with pytest.raises(TypeError): + processor(text=None, audio="foo") + + def test_nested_bad_audio_fails(self): + """Ensure we gracefully fail if audio is the wrong nested type.""" + tokenizer = self.get_tokenizer() + audio_processor = self.get_audio_processor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + audio_processor=audio_processor, + ) + + with pytest.raises(TypeError): + processor(text=None, audio=["foo"]) + + @parameterized.expand( + [ + ([1, 269920], [171], torch.rand), + ([1, 269920], [171], np.random.rand), + ] + ) + def test_audio_token_filling_same_len_feature_tensors(self, vec_dims, num_expected_features, random_func): + """Ensure audio token filling is handled correctly when we have + one or more audio inputs whose features are all the same length + stacked into a tensor / numpy array. + + NOTE: Currently we enforce that each sample can only have one audio. 
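+ The processor is expected to expand the audio placeholder so that the number of audio tokens in `input_ids` matches the number of computed audio features.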
+ """ + tokenizer = self.get_tokenizer() + audio_processor = self.get_audio_processor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + audio_processor=audio_processor, + ) + audio = random_func(*vec_dims) - 0.5 + + audio_tokens = processor.audio_token * vec_dims[0] + inputs = processor(text=f"{audio_tokens} Can you compare this audio?", audio=audio, return_tensors="pt") + + # Check the number of audio tokens + audio_token_id = tokenizer.get_vocab()[processor.audio_token] + + # Make sure the number of audio tokens matches the number of features + num_computed_features = processor.audio_processor._get_num_audio_features( + [vec_dims[1] for _ in range(vec_dims[0])], + ) + num_audio_tokens = int(torch.sum(inputs["input_ids"] == audio_token_id)) + assert list(inputs["input_features"].shape) == [vec_dims[0], 844, 160] + assert sum(num_computed_features) == num_audio_tokens + + def test_audio_token_filling_varying_len_feature_list(self): + """Ensure audio token filling is handled correctly when we have + multiple varying len audio sequences passed as a list. + """ + tokenizer = self.get_tokenizer() + audio_processor = self.get_audio_processor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + audio_processor=audio_processor, + ) + vec_dims = [[1, 142100], [1, 269920]] + num_expected_features = [90, 171] + audio = [torch.rand(dims) - 0.5 for dims in vec_dims] + + inputs = processor( + text=[ + f"{processor.audio_token} Can you describe this audio?", + f"{processor.audio_token} How does it compare with this audio?", + ], + audio=audio, + return_tensors="pt", + ) + + # Check the number of audio tokens + audio_token_id = tokenizer.get_vocab()[processor.audio_token] + + # Make sure the number of audio tokens matches the number of features + num_calculated_features = processor.audio_processor._get_num_audio_features( + [dims[1] for dims in vec_dims], + ) + num_audio_tokens = int(torch.sum(inputs["input_ids"] == audio_token_id)) + assert num_calculated_features == [90, 171] + assert sum(num_expected_features) == num_audio_tokens + + @require_torch_gpu + def test_device_override(self): + """Ensure that we regardless of the processing device, the tensors + produced are on the CPU. + """ + tokenizer = self.get_tokenizer() + audio_processor = self.get_audio_processor() + processor = GraniteSpeechProcessor( + tokenizer=tokenizer, + audio_processor=audio_processor, + ) + + vec_dims = [1, 269920] + wav = torch.rand(vec_dims) - 0.5 + + inputs = processor( + text=f"{processor.audio_token} Can you transcribe this audio?", + audio=wav, + return_tensors="pt", + device="cuda", + ) + + assert inputs["input_features"].device.type == "cpu" diff --git a/docs/transformers/tests/models/granitemoe/__init__.py b/docs/transformers/tests/models/granitemoe/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/granitemoe/test_modeling_granitemoe.py b/docs/transformers/tests/models/granitemoe/test_modeling_granitemoe.py new file mode 100644 index 0000000000000000000000000000000000000000..e451ff30c84eb695911624e86596e7be887d76a9 --- /dev/null +++ b/docs/transformers/tests/models/granitemoe/test_modeling_granitemoe.py @@ -0,0 +1,392 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch GraniteMoe model.""" + +import unittest + +from parameterized import parameterized + +from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed +from transformers.testing_utils import ( + Expectations, + require_read_token, + require_torch, + require_torch_accelerator, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + GraniteMoeForCausalLM, + GraniteMoeModel, + ) + from transformers.models.granitemoe.modeling_granitemoe import ( + GraniteMoeRotaryEmbedding, + ) + + +class GraniteMoeModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + pad_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return GraniteMoeConfig( + 
vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = GraniteMoeModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + GraniteMoeModel, + GraniteMoeForCausalLM, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": GraniteMoeModel, + "text-generation": GraniteMoeForCausalLM, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + fx_compatible = False + + # Need to use `0.8` instead of `0.9` for `test_cpu_offload` + # This is because we are hitting edge cases with the causal_mask buffer + model_split_percents = [0.5, 0.7, 0.8] + + def setUp(self): + self.model_tester = GraniteMoeModelTester(self) + self.config_tester = ConfigTester(self, config_class=GraniteMoeConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + @parameterized.expand([("linear",), ("dynamic",)]) + def test_model_rope_scaling_from_config(self, scaling_type): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + short_input = ids_tensor([1, 10], config.vocab_size) + long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + original_model = GraniteMoeModel(config) + original_model.to(torch_device) + original_model.eval() + original_short_output = original_model(short_input).last_hidden_state + original_long_output = original_model(long_input).last_hidden_state + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + config.rope_scaling = {"type": scaling_type, "factor": 10.0} + scaled_model = GraniteMoeModel(config) + scaled_model.to(torch_device) + scaled_model.eval() + scaled_short_output = 
scaled_model(short_input).last_hidden_state + scaled_long_output = scaled_model(long_input).last_hidden_state + + # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original + # maximum sequence length, so the outputs for the short input should match. + if scaling_type == "dynamic": + torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) + else: + self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) + + # The output should be different for long inputs + self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) + + def test_model_rope_scaling(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + scaling_factor = 10 + short_input_length = 10 + long_input_length = int(config.max_position_embeddings * 1.5) + + # Inputs + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device + position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) + position_ids_short = position_ids_short.unsqueeze(0) + position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) + position_ids_long = position_ids_long.unsqueeze(0) + + # Sanity check original RoPE + original_rope = GraniteMoeRotaryEmbedding(config=config).to(torch_device) + original_cos_short, original_sin_short = original_rope(x, position_ids_short) + original_cos_long, original_sin_long = original_rope(x, position_ids_long) + torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) + + # Sanity check linear RoPE scaling + # New position "x" should match original position with index "x/scaling_factor" + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + linear_scaling_rope = GraniteMoeRotaryEmbedding(config=config).to(torch_device) + linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) + linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) + torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) + for new_position in range(0, long_input_length, scaling_factor): + original_position = int(new_position // scaling_factor) + torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) + torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) + + # Sanity check Dynamic NTK RoPE scaling + # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase + # with scaling_factor (or that `inv_freq` decreases) + config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} + ntk_scaling_rope = GraniteMoeRotaryEmbedding(config=config).to(torch_device) + ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) + ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) + torch.testing.assert_close(ntk_cos_short, original_cos_short) + torch.testing.assert_close(ntk_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_sin_long, original_sin_long) + self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) + + # Sanity check Yarn RoPE scaling + # Scaling should be over the entire input + config.rope_scaling = {"type": "yarn", "factor": scaling_factor} + yarn_scaling_rope = GraniteMoeRotaryEmbedding(config=config).to(torch_device) + yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) + yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) + torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_short, original_cos_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_long, original_sin_long) + + +@require_torch_accelerator +class GraniteMoeIntegrationTest(unittest.TestCase): + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + @slow + @require_read_token + def test_model_3b_logits(self): + input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] + + model = GraniteMoeForCausalLM.from_pretrained("ibm/PowerMoE-3b", device_map="auto") + + with torch.no_grad(): + out = model(torch.tensor([input_ids]).to(torch_device)) + + # fmt: off + # Expected mean on dim = -1 + EXPECTED_MEANS = Expectations( + { + ("xpu", 3): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]), + ("cuda", 7): torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]]), + ("cuda", 8): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]), + } + ) + EXPECTED_MEAN = EXPECTED_MEANS.get_expectation() + + torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2) + + # slicing logits[0, 0, 0:15] + EXPECTED_SLICES = Expectations( + { + ("xpu", 3): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]), + ("cuda", 7): torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892, -2.2895, -2.2891, -2.2887, -2.2882, -2.2889, -2.2898, 
-2.2892]]), + ("cuda", 8): torch.tensor([[2.5479, -9.2124, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2162, -9.2122, -6.3101, -3.6224, -3.6377, -5.2542, -5.2524]]), + } + ) + EXPECTED_SLICE = EXPECTED_SLICES.get_expectation() + # fmt: on + + self.assertTrue( + torch.allclose( + EXPECTED_SLICE.to(torch_device), + out.logits[0, 0, :15].float(), + atol=1e-3, + rtol=1e-3, + ) + ) + + @slow + def test_model_3b_generation(self): + # ground truth text generated with dola_layers="low", repetition_penalty=1.2 + EXPECTED_TEXT_COMPLETIONS = Expectations( + { + ("xpu", 3): ( + "Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n" + "The first part is easy to understand. The second part is a little more difficult.\n\n" + "The second part of the theory of relativity is a little more difficult to understand.\n" + ), + ("cuda", 7): ( + "Simply put, the theory of relativity states that \n$$\n\\frac{d^2x^\\mu}{d\\tau^2} = " + "\\frac{1}{c^2}\\frac{d^2x^\\mu}{dt^2}\n$$\nwhere $x^\\mu$ is a four-vector, $\\tau$ is the proper time" + ), + ("cuda", 8): ( + "Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n" + "The first part is easy to understand. The second part is a little more difficult.\n\n" + "The second part of the theory of relativity is a little more difficult to understand.\n" + ), + } + ) + EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation() + + prompt = "Simply put, the theory of relativity states that " + tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b") + model = GraniteMoeForCausalLM.from_pretrained("ibm/PowerMoE-3b", device_map="auto") + model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + + # greedy generation outputs + generated_ids = model.generate(**model_inputs, max_new_tokens=64, top_p=None, temperature=1, do_sample=False) + text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) + + self.assertEqual(EXPECTED_TEXT_COMPLETION, text) diff --git a/docs/transformers/tests/models/granitemoeshared/__init__.py b/docs/transformers/tests/models/granitemoeshared/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/granitemoeshared/test_modeling_granitemoeshared.py b/docs/transformers/tests/models/granitemoeshared/test_modeling_granitemoeshared.py new file mode 100644 index 0000000000000000000000000000000000000000..5de3552c20f2bdc46d0c394b8aa3b3338c7badf9 --- /dev/null +++ b/docs/transformers/tests/models/granitemoeshared/test_modeling_granitemoeshared.py @@ -0,0 +1,395 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch GraniteMoeShared model.""" + +import unittest + +from parameterized import parameterized + +from transformers import AutoTokenizer, GraniteMoeSharedConfig, is_torch_available, set_seed +from transformers.testing_utils import ( + Expectations, + require_read_token, + require_torch, + require_torch_accelerator, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + GraniteMoeSharedForCausalLM, + GraniteMoeSharedModel, + ) + from transformers.models.granitemoeshared.modeling_granitemoeshared import ( + GraniteMoeSharedRotaryEmbedding, + ) + + +class GraniteMoeSharedModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + shared_intermediate_size=174, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + pad_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.shared_intermediate_size = shared_intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return GraniteMoeSharedConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + 
attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + shared_intermediate_size=self.shared_intermediate_size, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = GraniteMoeSharedModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class GraniteMoeSharedModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + GraniteMoeSharedModel, + GraniteMoeSharedForCausalLM, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": GraniteMoeSharedModel, + "text-generation": GraniteMoeSharedForCausalLM, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + fx_compatible = False + + # Need to use `0.8` instead of `0.9` for `test_cpu_offload` + # This is because we are hitting edge cases with the causal_mask buffer + model_split_percents = [0.5, 0.7, 0.8] + + def setUp(self): + self.model_tester = GraniteMoeSharedModelTester(self) + self.config_tester = ConfigTester(self, config_class=GraniteMoeSharedConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + @parameterized.expand([("linear",), ("dynamic",)]) + def test_model_rope_scaling_from_config(self, scaling_type): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + short_input = ids_tensor([1, 10], config.vocab_size) + long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + original_model = GraniteMoeSharedModel(config) + original_model.to(torch_device) + original_model.eval() + original_short_output = original_model(short_input).last_hidden_state + original_long_output = original_model(long_input).last_hidden_state + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + config.rope_scaling = {"type": scaling_type, "factor": 10.0} + scaled_model = GraniteMoeSharedModel(config) + scaled_model.to(torch_device) + scaled_model.eval() + scaled_short_output = scaled_model(short_input).last_hidden_state + scaled_long_output = scaled_model(long_input).last_hidden_state + + # Dynamic scaling does not change the RoPE 
embeddings until it receives an input longer than the original + # maximum sequence length, so the outputs for the short input should match. + if scaling_type == "dynamic": + torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) + else: + self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) + + # The output should be different for long inputs + self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) + + def test_model_rope_scaling(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + scaling_factor = 10 + short_input_length = 10 + long_input_length = int(config.max_position_embeddings * 1.5) + + # Inputs + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device + position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) + position_ids_short = position_ids_short.unsqueeze(0) + position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) + position_ids_long = position_ids_long.unsqueeze(0) + + # Sanity check original RoPE + original_rope = GraniteMoeSharedRotaryEmbedding(config=config).to(torch_device) + original_cos_short, original_sin_short = original_rope(x, position_ids_short) + original_cos_long, original_sin_long = original_rope(x, position_ids_long) + torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) + + # Sanity check linear RoPE scaling + # New position "x" should match original position with index "x/scaling_factor" + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + linear_scaling_rope = GraniteMoeSharedRotaryEmbedding(config=config).to(torch_device) + linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) + linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) + torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) + for new_position in range(0, long_input_length, scaling_factor): + original_position = int(new_position // scaling_factor) + torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) + torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) + + # Sanity check Dynamic NTK RoPE scaling + # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase + # with scaling_factor (or that `inv_freq` decreases) + config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} + ntk_scaling_rope = GraniteMoeSharedRotaryEmbedding(config=config).to(torch_device) + ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) + ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) + torch.testing.assert_close(ntk_cos_short, original_cos_short) + torch.testing.assert_close(ntk_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_sin_long, original_sin_long) + self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) + + # Sanity check Yarn RoPE scaling + # Scaling should be over the entire input + config.rope_scaling = {"type": "yarn", "factor": scaling_factor} + yarn_scaling_rope = GraniteMoeSharedRotaryEmbedding(config=config).to(torch_device) + yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) + yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) + torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_short, original_cos_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_long, original_sin_long) + + +@require_torch_accelerator +class GraniteMoeSharedIntegrationTest(unittest.TestCase): + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + @slow + @require_read_token + def test_model_3b_logits(self): + input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] + + model = GraniteMoeSharedForCausalLM.from_pretrained("ibm/PowerMoE-3b", device_map="auto") + + with torch.no_grad(): + out = model(torch.tensor([input_ids]).to(torch_device)) + + # fmt: off + # Expected mean on dim = -1 + EXPECTED_MEANS = Expectations( + { + ("xpu", 3): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]), + ("cuda", 7): torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]]), + ("cuda", 8): torch.tensor([[-4.4005, -3.6689, -3.6187, -2.8308, -3.9871, -3.1001, -2.8738, -2.8063]]), + } + ) + + EXPECTED_MEAN = EXPECTED_MEANS.get_expectation() + torch.testing.assert_close(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), rtol=1e-2, atol=1e-2) + + # slicing logits[0, 0, 0:15] + EXPECTED_SLICES = Expectations( + { + ("xpu", 3): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]), + ("cuda", 7): torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892, -2.2895, -2.2891, -2.2887, 
-2.2882, -2.2889, -2.2898, -2.2892]]), + ("cuda", 8): torch.tensor([[2.5479, -9.2123, -9.2121, -9.2175, -9.2122, -1.5024, -9.2121, -9.2122, -9.2161, -9.2122, -6.3100, -3.6223, -3.6377, -5.2542, -5.2523]]), + } + ) + EXPECTED_SLICE = EXPECTED_SLICES.get_expectation() + # fmt: on + + self.assertTrue( + torch.allclose( + EXPECTED_SLICE.to(torch_device), + out.logits[0, 0, :15].float(), + atol=1e-3, + rtol=1e-3, + ) + ) + + @slow + def test_model_3b_generation(self): + # ground truth text generated with dola_layers="low", repetition_penalty=1.2 + EXPECTED_TEXT_COMPLETIONS = Expectations( + { + ("xpu", 3): ( + "Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n" + "The first part is easy to understand. The second part is a little more difficult.\n\n" + "The second part of the theory of relativity is a little more difficult to understand.\n" + ), + ("cuda", 7): ( + "Simply put, the theory of relativity states that \n$$\n\\frac{d^2x^\\mu}{d\\tau^2} = " + "\\frac{1}{c^2}\\frac{d^2x^\\mu}{dt^2}\n$$\nwhere $x^\\mu$ is a four-vector, $\\tau$ is the proper time" + ), + ("cuda", 8): ( + "Simply put, the theory of relativity states that 1) the speed of light is constant, and 2) the speed of light is the same for all observers.\n\n" + "The first part is easy to understand. The second part is a little more difficult.\n\n" + "The second part of the theory of relativity is a little more difficult to understand.\n" + ), + } + ) + EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation() + + prompt = "Simply put, the theory of relativity states that " + tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b") + model = GraniteMoeSharedForCausalLM.from_pretrained("ibm/PowerMoE-3b", device_map="auto") + model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + + # greedy generation outputs + generated_ids = model.generate(**model_inputs, max_new_tokens=64, top_p=None, temperature=1, do_sample=False) + text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) + + self.assertEqual(EXPECTED_TEXT_COMPLETION, text) diff --git a/docs/transformers/tests/models/grounding_dino/__init__.py b/docs/transformers/tests/models/grounding_dino/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/docs/transformers/tests/models/grounding_dino/test_image_processing_grounding_dino.py new file mode 100644 index 0000000000000000000000000000000000000000..2c4ecb297e62db69e5db3ead2cbda5bcf89757ca --- /dev/null +++ b/docs/transformers/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -0,0 +1,646 @@ +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import pathlib +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision, slow +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available + +from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + + from transformers.models.grounding_dino.modeling_grounding_dino import GroundingDinoObjectDetectionOutput + +if is_vision_available(): + from PIL import Image + + from transformers import GroundingDinoImageProcessor + + if is_torchvision_available(): + from transformers import GroundingDinoImageProcessorFast + + +class GroundingDinoImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + do_rescale=True, + rescale_factor=1 / 255, + do_pad=True, + ): + # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p + size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_pad = do_pad + self.num_queries = 5 + self.embed_dim = 5 + + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTester.prepare_image_processor_dict with DeformableDetr->GroundingDino + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_rescale": self.do_rescale, + "rescale_factor": self.rescale_factor, + "do_pad": self.do_pad, + } + + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTester.get_expected_values with DeformableDetr->GroundingDino + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to GroundingDinoImageProcessor, + assuming do_resize is set to True with a scalar size. 
+ """ + if not batched: + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + elif isinstance(image, np.ndarray): + h, w = image.shape[0], image.shape[1] + else: + h, w = image.shape[1], image.shape[2] + if w < h: + expected_height = int(self.size["shortest_edge"] * h / w) + expected_width = self.size["shortest_edge"] + elif w > h: + expected_height = self.size["shortest_edge"] + expected_width = int(self.size["shortest_edge"] * w / h) + else: + expected_height = self.size["shortest_edge"] + expected_width = self.size["shortest_edge"] + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTester.expected_output_image_shape with DeformableDetr->GroundingDino + def expected_output_image_shape(self, images): + height, width = self.get_expected_values(images, batched=True) + return self.num_channels, height, width + + def get_fake_grounding_dino_output(self): + torch.manual_seed(42) + return GroundingDinoObjectDetectionOutput( + pred_boxes=torch.rand(self.batch_size, self.num_queries, 4), + logits=torch.rand(self.batch_size, self.num_queries, self.embed_dim), + ) + + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTester.prepare_image_inputs with DeformableDetr->GroundingDino + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = GroundingDinoImageProcessor if is_vision_available() else None + fast_image_processing_class = GroundingDinoImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = GroundingDinoImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest.test_image_processor_properties with DeformableDetr->GroundingDino + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "do_pad")) + self.assertTrue(hasattr(image_processing, "size")) + + # Copied from 
tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest.test_image_processor_from_dict_with_kwargs with DeformableDetr->GroundingDino + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) + self.assertEqual(image_processor.do_pad, True) + + image_processor = image_processing_class.from_dict( + self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False + ) + self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) + self.assertEqual(image_processor.do_pad, False) + + def test_post_process_object_detection(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class(**self.image_processor_dict) + outputs = self.image_processor_tester.get_fake_grounding_dino_output() + results = image_processor.post_process_object_detection(outputs, threshold=0.0) + + self.assertEqual(len(results), self.image_processor_tester.batch_size) + self.assertEqual(list(results[0].keys()), ["scores", "labels", "boxes"]) + self.assertEqual(results[0]["boxes"].shape, (self.image_processor_tester.num_queries, 4)) + self.assertEqual(results[0]["scores"].shape, (self.image_processor_tester.num_queries,)) + + expected_scores = torch.tensor([0.7050, 0.7222, 0.7222, 0.6829, 0.7220]) + torch.testing.assert_close(results[0]["scores"], expected_scores, rtol=1e-4, atol=1e-4) + + expected_box_slice = torch.tensor([0.6908, 0.4354, 1.0737, 1.3947]) + torch.testing.assert_close(results[0]["boxes"][0], expected_box_slice, rtol=1e-4, atol=1e-4) + + @slow + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest.test_call_pytorch_with_coco_detection_annotations with DeformableDetr->GroundingDino + def test_call_pytorch_with_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class() + encoding = image_processing(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + torch.testing.assert_close(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + 
expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + torch.testing.assert_close(encoding["labels"][0]["size"], expected_size) + + @slow + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->GroundingDino + def test_batched_coco_detection_annotations(self): + image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800)) + + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt") as f: + target = json.loads(f.read()) + + annotations_0 = {"image_id": 39769, "annotations": target} + annotations_1 = {"image_id": 39769, "annotations": target} + + # Adjust the bounding boxes for the resized image + w_0, h_0 = image_0.size + w_1, h_1 = image_1.size + for i in range(len(annotations_1["annotations"])): + coords = annotations_1["annotations"][i]["bbox"] + new_bbox = [ + coords[0] * w_1 / w_0, + coords[1] * h_1 / h_0, + coords[2] * w_1 / w_0, + coords[3] * h_1 / h_0, + ] + annotations_1["annotations"][i]["bbox"] = new_bbox + + images = [image_0, image_1] + annotations = [annotations_0, annotations_1] + + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class() + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + return_tensors="pt", # do_convert_annotations=True + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.6879, 0.4609, 0.0755, 0.3691], + [0.2118, 0.3359, 0.2601, 0.1566], + [0.5011, 0.5000, 0.9979, 1.0000], + [0.5010, 0.5020, 0.9979, 0.9959], + [0.3284, 0.5944, 0.5884, 0.8112], + [0.8394, 0.5445, 0.3213, 0.9110], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.4130, 0.2765, 0.0453, 0.2215], + [0.1272, 0.2016, 0.1561, 0.0940], + [0.3757, 0.4933, 0.7488, 0.9865], + [0.3759, 0.5002, 0.7492, 0.9955], + [0.1971, 0.5456, 0.3532, 0.8646], + [0.5790, 0.4115, 0.3430, 0.7161], + ] + ) + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1e-3, rtol=1e-3) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = 
image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1, rtol=1) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1, rtol=1) + + @slow + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest.test_call_pytorch_with_coco_panoptic_annotations with DeformableDetr->GroundingDino + def test_call_pytorch_with_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class(format="coco_panoptic") + encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + torch.testing.assert_close(encoding["pixel_values"][0, 0, 0, :3], expected_slice, rtol=1e-4, atol=1e-4) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + torch.testing.assert_close(encoding["labels"][0]["area"], expected_area) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + torch.testing.assert_close(encoding["labels"][0]["boxes"][0], expected_boxes_slice, rtol=1e-3, atol=1e-3) + # verify image_id + expected_image_id = torch.tensor([39769]) + 
torch.testing.assert_close(encoding["labels"][0]["image_id"], expected_image_id) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + torch.testing.assert_close(encoding["labels"][0]["iscrowd"], expected_is_crowd) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + torch.testing.assert_close(encoding["labels"][0]["class_labels"], expected_class_labels) + # verify masks + expected_masks_sum = 822873 + relative_error = torch.abs(encoding["labels"][0]["masks"].sum() - expected_masks_sum) / expected_masks_sum + self.assertTrue(relative_error < 1e-3) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + torch.testing.assert_close(encoding["labels"][0]["orig_size"], expected_orig_size) + # verify size + expected_size = torch.tensor([800, 1066]) + torch.testing.assert_close(encoding["labels"][0]["size"], expected_size) + + @slow + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->GroundingDino + def test_batched_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800)) + + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt") as f: + target = json.loads(f.read()) + + annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + w_0, h_0 = image_0.size + w_1, h_1 = image_1.size + for i in range(len(annotation_1["segments_info"])): + coords = annotation_1["segments_info"][i]["bbox"] + new_bbox = [ + coords[0] * w_1 / w_0, + coords[1] * h_1 / h_0, + coords[2] * w_1 / w_0, + coords[3] * h_1 / h_0, + ] + annotation_1["segments_info"][i]["bbox"] = new_bbox + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + images = [image_0, image_1] + annotations = [annotation_0, annotation_1] + + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class(format="coco_panoptic") + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_tensors="pt", + return_segmentation_masks=True, + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.2625, 0.5437, 0.4688, 0.8625], + [0.7719, 0.4104, 0.4531, 0.7125], + [0.5000, 0.4927, 0.9969, 0.9854], + [0.1688, 0.2000, 0.2063, 0.0917], + [0.5492, 0.2760, 0.0578, 0.2187], + [0.4992, 0.4990, 0.9984, 0.9979], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.1576, 0.3262, 0.2814, 0.5175], + [0.4634, 0.2463, 0.2720, 0.4275], + [0.3002, 0.2956, 0.5985, 0.5913], + [0.1013, 0.1200, 0.1238, 0.0550], + [0.3297, 0.1656, 0.0347, 0.1312], + [0.2997, 0.2994, 0.5994, 0.5987], + ] + ) + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1e-3, rtol=1e-3) + 
torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1e-3, rtol=1e-3) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + torch.testing.assert_close(encoding["labels"][0]["boxes"], expected_boxes_0, atol=1, rtol=1) + torch.testing.assert_close(encoding["labels"][1]["boxes"], expected_boxes_1, atol=1, rtol=1) + + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->GroundingDino + def test_max_width_max_height_resizing_and_pad_strategy(self): + for image_processing_class in self.image_processor_list: + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = image_processing_class( + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, 
max_width=100, image=200x100 -> 300x100 + image_processor = image_processing_class( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = image_processing_class( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, + ) + inputs = image_processor(images=[image_1, image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100])) + + def test_longest_edge_shortest_edge_resizing_strategy(self): + image_1 = torch.ones([958, 653, 3], dtype=torch.uint8) + + # max size is set; width < height; + # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436 + image_processor = GroundingDinoImageProcessor( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 640, 436])) + + image_2 = torch.ones([653, 958, 3], dtype=torch.uint8) + # max size is set; height < width; + # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640 + image_processor = GroundingDinoImageProcessor( + size={"longest_edge": 640, "shortest_edge": 640}, + do_pad=False, + ) + inputs = image_processor(images=[image_2], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 436, 640])) + + image_3 = torch.ones([100, 120, 3], dtype=torch.uint8) + # max size is set; width == size; height > max_size; + # do_pad=False, longest_edge=118, shortest_edge=100, image=120x100 -> 118x98 + image_processor = GroundingDinoImageProcessor( + size={"longest_edge": 118, "shortest_edge": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_3], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 98, 118])) + + image_4 = torch.ones([128, 50, 3], dtype=torch.uint8) + # max size is set; height == size; width < max_size; + # do_pad=False, longest_edge=256, shortest_edge=50, image=50x128 -> 50x128 + image_processor = GroundingDinoImageProcessor( + size={"longest_edge": 256, "shortest_edge": 50}, + do_pad=False, + ) + inputs = image_processor(images=[image_4], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 128, 50])) + + image_5 = torch.ones([50, 50, 3], dtype=torch.uint8) + # max size is set; height == width; width < max_size; + # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50 + image_processor = GroundingDinoImageProcessor( + size={"longest_edge": 117, "shortest_edge": 50}, + do_pad=False, + ) + inputs = image_processor(images=[image_5], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 50, 50])) diff --git a/docs/transformers/tests/models/grounding_dino/test_modeling_grounding_dino.py b/docs/transformers/tests/models/grounding_dino/test_modeling_grounding_dino.py new file mode 100644 index 0000000000000000000000000000000000000000..19a5054c981f66c3a550cc5917687b09b69ce104 --- /dev/null +++ b/docs/transformers/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -0,0 +1,836 @@ +# Copyright 2024 The HuggingFace 
Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Grounding DINO model.""" + +import collections +import inspect +import math +import re +import unittest + +from datasets import load_dataset + +from transformers import ( + GroundingDinoConfig, + SwinConfig, + is_torch_available, + is_vision_available, +) +from transformers.file_utils import cached_property +from transformers.testing_utils import ( + is_flaky, + require_timm, + require_torch, + require_torch_accelerator, + require_vision, + slow, + torch_device, +) + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import GroundingDinoConfig, GroundingDinoForObjectDetection, GroundingDinoModel + from transformers.pytorch_utils import id_tensor_storage + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoProcessor + + +def generate_fake_bounding_boxes(n_boxes): + """Generate bounding boxes in the format (center_x, center_y, width, height)""" + # Validate the input + if not isinstance(n_boxes, int): + raise ValueError("n_boxes must be an integer") + if n_boxes <= 0: + raise ValueError("n_boxes must be a positive integer") + + # Generate random bounding boxes in the format (center_x, center_y, width, height) + bounding_boxes = torch.rand((n_boxes, 4)) + + # Extract the components + center_x = bounding_boxes[:, 0] + center_y = bounding_boxes[:, 1] + width = bounding_boxes[:, 2] + height = bounding_boxes[:, 3] + + # Ensure width and height do not exceed bounds + width = torch.min(width, torch.tensor(1.0)) + height = torch.min(height, torch.tensor(1.0)) + + # Ensure the bounding box stays within the normalized space + center_x = torch.where(center_x - width / 2 < 0, width / 2, center_x) + center_x = torch.where(center_x + width / 2 > 1, 1 - width / 2, center_x) + center_y = torch.where(center_y - height / 2 < 0, height / 2, center_y) + center_y = torch.where(center_y + height / 2 > 1, 1 - height / 2, center_y) + + # Combine back into bounding boxes + bounding_boxes = torch.stack([center_x, center_y, width, height], dim=1) + + return bounding_boxes + + +class GroundingDinoModelTester: + def __init__( + self, + parent, + batch_size=4, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=2, + num_channels=3, + image_size=98, + n_targets=8, + num_labels=2, + num_feature_levels=4, + encoder_n_points=2, + decoder_n_points=6, + max_text_len=7, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = 
num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.image_size = image_size + self.n_targets = n_targets + self.num_labels = num_labels + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + self.max_text_len = max_text_len + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length_vision = ( + math.ceil(self.image_size / 8) ** 2 + + math.ceil(self.image_size / 16) ** 2 + + math.ceil(self.image_size / 32) ** 2 + + math.ceil(self.image_size / 64) ** 2 + ) + + self.encoder_seq_length_text = self.max_text_len + + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device) + + # When using `GroundingDino` the text input template is '{label1}. {label2}. {label3. ... {labelN}.' + # Therefore to avoid errors when running tests with `labels` `input_ids` have to follow this structure. + # Otherwise when running `build_label_maps` it will throw an error when trying to split the input_ids into segments. + input_ids = torch.tensor([101, 3869, 1012, 11420, 3869, 1012, 102], device=torch_device) + input_ids = input_ids.unsqueeze(0).expand(self.batch_size, -1) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = generate_fake_bounding_boxes(self.n_targets).to(torch_device) + target["masks"] = torch.rand(self.n_targets, self.image_size, self.image_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, input_ids, labels + + def get_config(self): + swin_config = SwinConfig( + window_size=7, + embed_dim=8, + depths=[1, 1, 1, 1], + num_heads=[1, 1, 1, 1], + image_size=self.image_size, + out_features=["stage2", "stage3", "stage4"], + out_indices=[2, 3, 4], + ) + text_backbone = { + "hidden_size": 8, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "intermediate_size": 8, + "max_position_embeddings": 8, + "model_type": "bert", + } + return GroundingDinoConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + num_feature_levels=self.num_feature_levels, + encoder_n_points=self.encoder_n_points, + decoder_n_points=self.decoder_n_points, + use_timm_backbone=False, + backbone_config=swin_config, + max_text_len=self.max_text_len, + text_config=text_backbone, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, input_ids, labels = self.prepare_config_and_inputs() + 
inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask, "input_ids": input_ids} + return config, inputs_dict + + def create_and_check_model(self, config, pixel_values, pixel_mask, input_ids, labels): + model = GroundingDinoModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) + + def create_and_check_object_detection_head_model(self, config, pixel_values, pixel_mask, input_ids, labels): + model = GroundingDinoForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, config.max_text_len)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, config.max_text_len)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_torch +class GroundingDinoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (GroundingDinoModel, GroundingDinoForObjectDetection) if is_torch_available() else () + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + pipeline_model_mapping = ( + {"image-feature-extraction": GroundingDinoModel, "zero-shot-object-detection": GroundingDinoForObjectDetection} + if is_torch_available() + else {} + ) + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "GroundingDinoForObjectDetection": + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.image_size, + self.model_tester.image_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = GroundingDinoModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=GroundingDinoConfig, + has_text_modality=False, + common_properties=["d_model", "encoder_attention_heads", "decoder_attention_heads"], + ) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_object_detection_head_model(*config_and_inputs) + + @unittest.skip(reason="Grounding DINO does not use inputs_embeds") + def test_inputs_embeds(self): + pass 
+ + @unittest.skip(reason="Grounding DINO does not have a get_input_embeddings method") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="Grounding DINO does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions[-1] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions[-1] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.encoder_n_points, + ], + ) + out_len = len(outputs) + + correct_outlen = 12 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes and input_ids + if model_class.__name__ == "GroundingDinoForObjectDetection": + correct_outlen += 3 + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions[0] + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, self.model_tester.num_queries, self.model_tester.num_queries], + ) + + # cross attentions + cross_attentions = outputs.decoder_attentions[-1] + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.decoder_n_points, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + 3, len(outputs)) + + self_attentions = outputs.encoder_attentions[-1] + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.encoder_n_points, + ], + ) + + # overwrite since hidden_states are called encoder_text_hidden_states + def test_hidden_states_output(self): + def 
check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_vision_hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + seq_len = self.model_tester.encoder_seq_length_vision + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_len, self.model_tester.hidden_size], + ) + + hidden_states = outputs.encoder_text_hidden_states + + self.assertEqual(len(hidden_states), expected_num_layers) + + seq_len = self.model_tester.encoder_seq_length_text + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_len, self.model_tester.hidden_size], + ) + + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) + self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [decoder_seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + encoder_hidden_states = outputs.encoder_vision_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0][0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + cross_attentions = outputs.decoder_attentions[-1][0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values", "input_ids"] + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + def test_different_timm_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + 
config.backbone = "tf_mobilenetv3_small_075" + config.use_timm_backbone = True + config.backbone_config = None + config.backbone_kwargs = {"in_chans": 3, "out_indices": (2, 3, 4)} + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "GroundingDinoForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + config.max_text_len, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + + self.assertTrue(outputs) + + @require_timm + def test_hf_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Load a pretrained HF checkpoint as backbone + config.backbone = "microsoft/resnet-18" + config.backbone_config = None + config.use_timm_backbone = False + config.use_pretrained_backbone = True + config.backbone_kwargs = {"out_indices": [2, 3, 4]} + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "GroundingDinoForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + config.max_text_len, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if ( + "level_embed" in name + or "sampling_offsets.bias" in name + or "text_param" in name + or "vision_param" in name + or "value_proj" in name + or "output_proj" in name + or "reference_points" in name + ): + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # Copied from tests.models.deformable_detr.test_modeling_deformable_detr.DeformableDetrModelTest.test_two_stage_training with DeformableDetr->GroundingDino + def test_two_stage_training(self): + model_class = GroundingDinoForObjectDetection + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + config.two_stage = True + config.auxiliary_loss = True + config.with_box_refine = True + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_tied_weights_keys(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + config.tie_word_embeddings = True + for model_class in self.all_model_classes: + model_tied = model_class(config) + + ptrs = collections.defaultdict(list) + for name, tensor in model_tied.state_dict().items(): + ptrs[id_tensor_storage(tensor)].append(name) + + # These are all the pointers of shared tensors. 
+ tied_params = [names for _, names in ptrs.items() if len(names) > 1] + + tied_weight_keys = model_tied._tied_weights_keys if model_tied._tied_weights_keys is not None else [] + # Detect we get a hit for each key + for key in tied_weight_keys: + if not any(re.search(key, p) for group in tied_params for p in group): + raise ValueError(f"{key} is not a tied weight key for {model_class}.") + + # Removed tied weights found from tied params -> there should only be one left after + for key in tied_weight_keys: + for i in range(len(tied_params)): + tied_params[i] = [p for p in tied_params[i] if re.search(key, p) is None] + + # GroundingDino when sharing weights also uses the shared ones in GroundingDinoDecoder + # Therefore, differently from DeformableDetr, we expect the group lens to be 2 + # one for self.bbox_embed in GroundingDinoForObejectDetection and another one + # in the decoder + tied_params = [group for group in tied_params if len(group) > 2] + self.assertListEqual( + tied_params, + [], + f"Missing `_tied_weights_keys` for {model_class}: add all of {tied_params} except one.", + ) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +def prepare_text(): + text = "a cat." + return text + + +@require_timm +@require_vision +@slow +class GroundingDinoModelIntegrationTests(unittest.TestCase): + @cached_property + def default_processor(self): + return AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny") if is_vision_available() else None + + def test_inference_object_detection_head(self): + model = GroundingDinoForObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny").to(torch_device) + + processor = self.default_processor + image = prepare_img() + text = prepare_text() + encoding = processor(images=image, text=text, return_tensors="pt").to(torch_device) + + with torch.no_grad(): + outputs = model(**encoding) + + expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.d_model)) + self.assertEqual(outputs.logits.shape, expected_shape_logits) + + expected_boxes = torch.tensor( + [[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]] + ).to(torch_device) + expected_logits = torch.tensor( + [[-4.8913, -0.1900, -0.2161], [-4.9653, -0.3719, -0.3950], [-5.9599, -3.3765, -3.3104]] + ).to(torch_device) + + torch.testing.assert_close(outputs.logits[0, :3, :3], expected_logits, rtol=1e-3, atol=1e-3) + + expected_shape_boxes = torch.Size((1, model.config.num_queries, 4)) + self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) + torch.testing.assert_close(outputs.pred_boxes[0, :3, :3], expected_boxes, rtol=1e-4, atol=1e-4) + + # verify postprocessing + results = processor.image_processor.post_process_object_detection( + outputs, threshold=0.35, target_sizes=[(image.height, image.width)] + )[0] + expected_scores = torch.tensor([0.4526, 0.4082]).to(torch_device) + expected_slice_boxes = torch.tensor([344.8143, 23.1796, 637.4004, 373.8295]).to(torch_device) + + self.assertEqual(len(results["scores"]), 2) + torch.testing.assert_close(results["scores"], expected_scores, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(results["boxes"][0, :], expected_slice_boxes, rtol=1e-2, atol=1e-2) + + # verify grounded postprocessing + expected_labels = ["a cat", "a cat"] + results = processor.post_process_grounded_object_detection( + outputs=outputs, + input_ids=encoding.input_ids, + threshold=0.35, + 
text_threshold=0.3, + target_sizes=[(image.height, image.width)], + )[0] + + torch.testing.assert_close(results["scores"], expected_scores, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(results["boxes"][0, :], expected_slice_boxes, rtol=1e-2, atol=1e-2) + self.assertListEqual(results["text_labels"], expected_labels) + + @require_torch_accelerator + @is_flaky() + def test_inference_object_detection_head_equivalence_cpu_gpu(self): + processor = self.default_processor + image = prepare_img() + text = prepare_text() + encoding = processor(images=image, text=text, return_tensors="pt") + + # 1. run model on CPU + model = GroundingDinoForObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny") + + with torch.no_grad(): + cpu_outputs = model(**encoding) + + # 2. run model on GPU + model.to(torch_device) + encoding = encoding.to(torch_device) + with torch.no_grad(): + gpu_outputs = model(**encoding) + + # 3. assert equivalence + for key in cpu_outputs.keys(): + torch.testing.assert_close(cpu_outputs[key], gpu_outputs[key].cpu(), rtol=1e-3, atol=1e-3) + + expected_logits = torch.tensor( + [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] + ) + torch.testing.assert_close(cpu_outputs.logits[0, :3, :3], expected_logits, rtol=1e-3, atol=1e-3) + + # assert postprocessing + results_cpu = processor.image_processor.post_process_object_detection( + cpu_outputs, threshold=0.35, target_sizes=[(image.height, image.width)] + )[0] + + result_gpu = processor.image_processor.post_process_object_detection( + gpu_outputs, threshold=0.35, target_sizes=[(image.height, image.width)] + )[0] + + torch.testing.assert_close(results_cpu["scores"], result_gpu["scores"].cpu(), rtol=1e-3, atol=1e-3) + torch.testing.assert_close(results_cpu["boxes"], result_gpu["boxes"].cpu(), rtol=1e-3, atol=1e-3) + + @is_flaky() + def test_cross_attention_mask(self): + model = GroundingDinoForObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny").to(torch_device) + + processor = self.default_processor + image = prepare_img() + text1 = "a cat." + text2 = "a remote control." + text_batched = [text1, text2] + + encoding1 = processor(images=image, text=text1, return_tensors="pt").to(torch_device) + encoding2 = processor(images=image, text=text2, return_tensors="pt").to(torch_device) + # If we batch the text and cross attention masking is working the batched result should be equal to + # The singe text result + encoding_batched = processor( + images=[image] * len(text_batched), text=text_batched, padding="longest", return_tensors="pt" + ).to(torch_device) + + with torch.no_grad(): + outputs1 = model(**encoding1) + outputs2 = model(**encoding2) + outputs_batched = model(**encoding_batched) + + torch.testing.assert_close(outputs1.logits, outputs_batched.logits[:1], rtol=1e-3, atol=1e-3) + # For some reason 12 elements are > 1e-3, but the rest are fine + self.assertTrue(torch.allclose(outputs2.logits, outputs_batched.logits[1:], atol=1.8e-3)) + + def test_grounding_dino_loss(self): + ds = load_dataset("EduardoPacheco/aquarium-sample", split="train") + image_processor = self.default_processor.image_processor + tokenizer = self.default_processor.tokenizer + id2label = {0: "fish", 1: "jellyfish", 2: "penguins", 3: "sharks", 4: "puffins", 5: "stingrays", 6: "starfish"} + prompt = ". ".join(id2label.values()) + "." 
+ + text_inputs = tokenizer([prompt, prompt], return_tensors="pt") + image_inputs = image_processor(images=ds["image"], annotations=ds["annotations"], return_tensors="pt") + + # Passing auxiliary_loss=True to compare with the expected loss + model = GroundingDinoForObjectDetection.from_pretrained( + "IDEA-Research/grounding-dino-tiny", + auxiliary_loss=True, + ) + # Interested in the loss only + model.eval() + with torch.no_grad(): + outputs = model(**text_inputs, **image_inputs) + + # Loss differs by CPU and GPU, also this can be changed in future. + expected_loss_dict = { + "loss_ce": torch.tensor(1.1147), + "loss_bbox": torch.tensor(0.2031), + "loss_giou": torch.tensor(0.5819), + "loss_ce_0": torch.tensor(1.1941), + "loss_bbox_0": torch.tensor(0.1978), + "loss_giou_0": torch.tensor(0.5524), + "loss_ce_1": torch.tensor(1.1621), + "loss_bbox_1": torch.tensor(0.1909), + "loss_giou_1": torch.tensor(0.5892), + "loss_ce_2": torch.tensor(1.1641), + "loss_bbox_2": torch.tensor(0.1892), + "loss_giou_2": torch.tensor(0.5626), + "loss_ce_3": torch.tensor(1.1943), + "loss_bbox_3": torch.tensor(0.1941), + "loss_giou_3": torch.tensor(0.5607), + "loss_ce_4": torch.tensor(1.0956), + "loss_bbox_4": torch.tensor(0.2008), + "loss_giou_4": torch.tensor(0.5836), + "loss_ce_enc": torch.tensor(16226.3164), + "loss_bbox_enc": torch.tensor(0.3063), + "loss_giou_enc": torch.tensor(0.7380), + } + + expected_loss = torch.tensor(32482.2305) + + for key in expected_loss_dict: + self.assertTrue(torch.allclose(outputs.loss_dict[key], expected_loss_dict[key], atol=1e-3)) + + self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-3)) diff --git a/docs/transformers/tests/models/grounding_dino/test_processor_grounding_dino.py b/docs/transformers/tests/models/grounding_dino/test_processor_grounding_dino.py new file mode 100644 index 0000000000000000000000000000000000000000..35b77c39f2ba959ff25b2da96b39b5022dfce44a --- /dev/null +++ b/docs/transformers/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -0,0 +1,296 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
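+"""Testing suite for the GroundingDino processor (tokenizer + image processor pairing and grounded post-processing)."""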
+ +import json +import os +import shutil +import tempfile +import unittest +from typing import Optional + +import pytest + +from transformers import BertTokenizer, BertTokenizerFast, GroundingDinoProcessor +from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_torch_available(): + import torch + + from transformers.models.grounding_dino.modeling_grounding_dino import GroundingDinoObjectDetectionOutput + +if is_vision_available(): + from transformers import GroundingDinoImageProcessor + + +@require_torch +@require_vision +class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "IDEA-Research/grounding-dino-base" + processor_class = GroundingDinoProcessor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + + vocab_tokens = ["[UNK]","[CLS]","[SEP]","[PAD]","[MASK]","want","##want","##ed","wa","un","runn","##ing",",","low","lowest"] # fmt: skip + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + image_processor_map = { + "do_resize": True, + "size": None, + "do_normalize": True, + "image_mean": [0.5, 0.5, 0.5], + "image_std": [0.5, 0.5, 0.5], + "do_rescale": True, + "rescale_factor": 1 / 255, + "do_pad": True, + } + cls.image_processor_file = os.path.join(cls.tmpdirname, IMAGE_PROCESSOR_NAME) + with open(cls.image_processor_file, "w", encoding="utf-8") as fp: + json.dump(image_processor_map, fp) + + image_processor = GroundingDinoImageProcessor() + tokenizer = BertTokenizer.from_pretrained(cls.from_pretrained_id) + + processor = GroundingDinoProcessor(image_processor, tokenizer) + + processor.save_pretrained(cls.tmpdirname) + + cls.batch_size = 7 + cls.num_queries = 5 + cls.embed_dim = 5 + cls.seq_length = 5 + + def prepare_text_inputs(self, batch_size: Optional[int] = None, modality: Optional[str] = None): + labels = ["a cat", "remote control"] + labels_longer = ["a person", "a car", "a dog", "a cat"] + + if batch_size is None: + return labels + + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + + if batch_size == 1: + return [labels] + return [labels, labels_longer] + [labels] * (batch_size - 2) + + @classmethod + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_tokenizer with CLIP->Bert + def get_tokenizer(cls, **kwargs): + return BertTokenizer.from_pretrained(cls.tmpdirname, **kwargs) + + @classmethod + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_rust_tokenizer with CLIP->Bert + def get_rust_tokenizer(cls, **kwargs): + return BertTokenizerFast.from_pretrained(cls.tmpdirname, **kwargs) + + @classmethod + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_image_processor with CLIP->GroundingDino + def get_image_processor(cls, **kwargs): + return GroundingDinoImageProcessor.from_pretrained(cls.tmpdirname, **kwargs) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname, ignore_errors=True) + + def get_fake_grounding_dino_output(self): + torch.manual_seed(42) + return GroundingDinoObjectDetectionOutput( + pred_boxes=torch.rand(self.batch_size, self.num_queries, 4), + 
logits=torch.rand(self.batch_size, self.num_queries, self.embed_dim), + input_ids=self.get_fake_grounding_dino_input_ids(), + ) + + def get_fake_grounding_dino_input_ids(self): + input_ids = torch.tensor([101, 1037, 4937, 1012, 102]) + return torch.stack([input_ids] * self.batch_size, dim=0) + + def test_post_process_grounded_object_detection(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) + + grounding_dino_output = self.get_fake_grounding_dino_output() + + post_processed = processor.post_process_grounded_object_detection(grounding_dino_output) + + self.assertEqual(len(post_processed), self.batch_size) + self.assertEqual(list(post_processed[0].keys()), ["scores", "boxes", "text_labels", "labels"]) + self.assertEqual(post_processed[0]["boxes"].shape, (self.num_queries, 4)) + self.assertEqual(post_processed[0]["scores"].shape, (self.num_queries,)) + + expected_scores = torch.tensor([0.7050, 0.7222, 0.7222, 0.6829, 0.7220]) + torch.testing.assert_close(post_processed[0]["scores"], expected_scores, rtol=1e-4, atol=1e-4) + + expected_box_slice = torch.tensor([0.6908, 0.4354, 1.0737, 1.3947]) + torch.testing.assert_close(post_processed[0]["boxes"][0], expected_box_slice, rtol=1e-4, atol=1e-4) + + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_default with CLIP->GroundingDino,GroundingDinoTokenizer->BertTokenizer + def test_save_load_pretrained_default(self): + tokenizer_slow = self.get_tokenizer() + tokenizer_fast = self.get_rust_tokenizer() + image_processor = self.get_image_processor() + + with tempfile.TemporaryDirectory() as tmpdir: + processor_slow = GroundingDinoProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) + processor_slow.save_pretrained(tmpdir) + processor_slow = GroundingDinoProcessor.from_pretrained(tmpdir, use_fast=False) + + processor_fast = GroundingDinoProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast.save_pretrained(tmpdir) + processor_fast = GroundingDinoProcessor.from_pretrained(tmpdir) + + self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) + self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) + self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) + self.assertIsInstance(processor_slow.tokenizer, BertTokenizer) + self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) + + self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertIsInstance(processor_slow.image_processor, GroundingDinoImageProcessor) + self.assertIsInstance(processor_fast.image_processor, GroundingDinoImageProcessor) + + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_additional_features with CLIP->GroundingDino,GroundingDinoTokenizer->BertTokenizer + def test_save_load_pretrained_additional_features(self): + with tempfile.TemporaryDirectory() as tmpdir: + processor = GroundingDinoProcessor( + tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor() + ) + processor.save_pretrained(tmpdir) + + tokenizer_add_kwargs = BertTokenizer.from_pretrained(tmpdir, bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = 
GroundingDinoImageProcessor.from_pretrained( + tmpdir, do_normalize=False, padding_value=1.0 + ) + + processor = GroundingDinoProcessor.from_pretrained( + tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, BertTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, GroundingDinoImageProcessor) + + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_image_processor with CLIP->GroundingDino + def test_image_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) + + image_input = self.prepare_image_inputs() + + input_image_proc = image_processor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_image_proc.keys(): + self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) + + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer with CLIP->GroundingDino + def test_tokenizer(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual( + list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values", "pixel_mask"] + ) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer_decode with CLIP->GroundingDino + def test_tokenizer_decode(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_model_input_names with CLIP->GroundingDino + def test_model_input_names(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), processor.model_input_names) + + def test_text_preprocessing_equivalence(self): + processor = 
GroundingDinoProcessor.from_pretrained(self.tmpdirname) + + # check for single input + formatted_labels = "a cat. a remote control." + labels = ["a cat", "a remote control"] + inputs1 = processor(text=formatted_labels, return_tensors="pt") + inputs2 = processor(text=labels, return_tensors="pt") + self.assertTrue( + torch.allclose(inputs1["input_ids"], inputs2["input_ids"]), + f"Input ids are not equal for single input: {inputs1['input_ids']} != {inputs2['input_ids']}", + ) + + # check for batched input + formatted_labels = ["a cat. a remote control.", "a car. a person."] + labels = [["a cat", "a remote control"], ["a car", "a person"]] + inputs1 = processor(text=formatted_labels, return_tensors="pt", padding=True) + inputs2 = processor(text=labels, return_tensors="pt", padding=True) + self.assertTrue( + torch.allclose(inputs1["input_ids"], inputs2["input_ids"]), + f"Input ids are not equal for batched input: {inputs1['input_ids']} != {inputs2['input_ids']}", + ) diff --git a/docs/transformers/tests/models/groupvit/__init__.py b/docs/transformers/tests/models/groupvit/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/groupvit/test_modeling_groupvit.py b/docs/transformers/tests/models/groupvit/test_modeling_groupvit.py new file mode 100644 index 0000000000000000000000000000000000000000..6226f6ff0f6a575157e62f91108ba1ff175b5cd1 --- /dev/null +++ b/docs/transformers/tests/models/groupvit/test_modeling_groupvit.py @@ -0,0 +1,723 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch GroupViT model.""" + +import inspect +import os +import random +import tempfile +import unittest + +import numpy as np +import requests + +from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig +from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device +from transformers.utils import is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, + random_attention_mask, +) +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import GroupViTModel, GroupViTTextModel, GroupViTVisionModel + + +if is_vision_available(): + from PIL import Image + + from transformers import CLIPProcessor + + +class GroupViTVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + depths=[6, 3, 3], + num_group_tokens=[64, 8, 0], + num_output_groups=[64, 8, 8], + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.depths = depths + self.num_hidden_layers = sum(depths) + self.expected_num_hidden_layers = len(depths) + 1 + self.num_group_tokens = num_group_tokens + self.num_output_groups = num_output_groups + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + num_patches = (image_size // patch_size) ** 2 + # no [CLS] token for GroupViT + self.seq_length = num_patches + + def prepare_config_and_inputs(self): + rng = random.Random(0) + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size], rng=rng) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return GroupViTVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + depths=self.depths, + num_group_tokens=self.num_group_tokens, + num_output_groups=self.num_output_groups, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = GroupViTVisionModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.num_output_groups[-1], self.hidden_size) + ) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class GroupViTVisionModelTest(ModelTesterMixin, 
unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as GROUPVIT does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (GroupViTVisionModel,) if is_torch_available() else () + + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = GroupViTVisionModelTester(self) + self.config_tester = ConfigTester( + self, config_class=GroupViTVisionConfig, has_text_modality=False, hidden_size=37 + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="GroupViT does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @is_flaky(description="The `index` computed with `max()` in `hard_softmax` is not stable.") + def test_batching_equivalence(self): + super().test_batching_equivalence() + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + + expected_num_attention_outputs = sum(g > 0 for g in self.model_tester.num_group_tokens) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + # GroupViT returns attention grouping of each stage + self.assertEqual(len(attentions), sum(g > 0 for g in self.model_tester.num_group_tokens)) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + # GroupViT returns attention grouping of each stage + self.assertEqual(len(attentions), expected_num_attention_outputs) + + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + added_hidden_states = 1 + self.assertEqual(out_len + 
added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + # GroupViT returns attention grouping of each stage + self.assertEqual(len(self_attentions), expected_num_attention_outputs) + for i, self_attn in enumerate(self_attentions): + if self_attn is None: + continue + + self.assertListEqual( + list(self_attentions[i].shape[-2:]), + [ + self.model_tester.num_output_groups[i], + self.model_tester.num_output_groups[i - 1] if i > 0 else seq_len, + ], + ) + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + # override since the attention mask from GroupViT is not used to compute loss, thus no grad + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = self.has_attentions + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + if config.is_encoder_decoder: + # Seq2Seq models + encoder_hidden_states = outputs.encoder_hidden_states[0] + encoder_hidden_states.retain_grad() + + decoder_hidden_states = outputs.decoder_hidden_states[0] + decoder_hidden_states.retain_grad() + + if self.has_attentions: + encoder_attentions = outputs.encoder_attentions[0] + encoder_attentions.retain_grad() + + decoder_attentions = outputs.decoder_attentions[0] + decoder_attentions.retain_grad() + + cross_attentions = outputs.cross_attentions[0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(decoder_hidden_states.grad) + + if self.has_attentions: + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(decoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + else: + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + hidden_states.retain_grad() + + if self.has_attentions: + attentions = outputs.attentions[0] + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + + if self.has_attentions: + self.assertIsNone(attentions.grad) + + @slow + def test_model_from_pretrained(self): + model_name = "nvidia/groupvit-gcc-yfcc" + model = GroupViTVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class GroupViTTextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = 
batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + rng = random.Random(0) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, rng=rng) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = self.get_config() + + return config, input_ids, input_mask + + def get_config(self): + return GroupViTTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, input_ids, input_mask): + model = GroupViTTextModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class GroupViTTextModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (GroupViTTextModel,) if is_torch_available() else () + test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = GroupViTTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=GroupViTTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + 
@unittest.skip(reason="GroupViTTextModel does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "nvidia/groupvit-gcc-yfcc" + model = GroupViTTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class GroupViTModelTester: + def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): + if text_kwargs is None: + text_kwargs = {} + if vision_kwargs is None: + vision_kwargs = {} + + self.parent = parent + self.text_model_tester = GroupViTTextModelTester(parent, **text_kwargs) + self.vision_model_tester = GroupViTVisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test + self.is_training = is_training + + def prepare_config_and_inputs(self): + text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask, pixel_values + + def get_config(self): + return GroupViTConfig.from_text_vision_configs( + self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + ) + + def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): + model = GroupViTModel(config).to(torch_device).eval() + with torch.no_grad(): + result = model(input_ids, pixel_values, attention_mask) + self.parent.assertEqual( + result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "return_loss": True, + } + return config, inputs_dict + + +@require_torch +class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (GroupViTModel,) if is_torch_available() else () + pipeline_model_mapping = {"feature-extraction": GroupViTModel} if is_torch_available() else {} + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + + def setUp(self): + self.model_tester = GroupViTModelTester(self) + common_properties = ["projection_dim", "projection_intermediate_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=GroupViTConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_config(self): + self.config_tester.run_common_tests() + + @is_flaky(description="The `index` computed with `max()` in `hard_softmax` is not stable.") + def test_batching_equivalence(self): + super().test_batching_equivalence() + + @unittest.skip(reason="hidden_states are tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="input_embeds are tested in individual model tests") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="tested in individual model tests") + def 
test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="GroupViTModel does not have input/output embeddings") + def test_model_get_set_embeddings(self): + pass + + # override as the `logit_scale` parameter initialization is different for GROUPVIT + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + # check if `logit_scale` is initialized as per the original implementation + if name == "logit_scale": + self.assertAlmostEqual( + param.data.item(), + np.log(1 / 0.07), + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + self.skipTest(reason="test_torchscript is set to False") + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + configs_no_init.return_dict = False + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + try: + input_ids = inputs_dict["input_ids"] + pixel_values = inputs_dict["pixel_values"] # GROUPVIT needs pixel_values + traced_model = torch.jit.trace(model, (input_ids, pixel_values)) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + non_persistent_buffers = {} + for key in loaded_model_state_dict.keys(): + if key not in model_state_dict.keys(): + non_persistent_buffers[key] = loaded_model_state_dict[key] + + loaded_model_state_dict = { + key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers + } + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + model_buffers = list(model.buffers()) + for non_persistent_buffer in non_persistent_buffers.values(): + found_buffer = False + for i, model_buffer in enumerate(model_buffers): + if torch.equal(non_persistent_buffer, model_buffer): + found_buffer = True + break + + self.assertTrue(found_buffer) + model_buffers.pop(i) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_load_vision_text_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Save GroupViTConfig and check if we can load GroupViTVisionConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + vision_config = 
GroupViTVisionConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) + + # Save GroupViTConfig and check if we can load GroupViTTextConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + text_config = GroupViTTextConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) + + @slow + def test_model_from_pretrained(self): + model_name = "nvidia/groupvit-gcc-yfcc" + model = GroupViTModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@require_vision +@require_torch +class GroupViTModelIntegrationTest(unittest.TestCase): + @slow + def test_inference(self): + model_name = "nvidia/groupvit-gcc-yfcc" + model = GroupViTModel.from_pretrained(model_name) + processor = CLIPProcessor.from_pretrained(model_name) + + image = prepare_img() + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt" + ) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + self.assertEqual( + outputs.logits_per_image.shape, + torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + ) + self.assertEqual( + outputs.logits_per_text.shape, + torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), + ) + + expected_logits = torch.tensor([[13.3523, 6.3629]]) + + torch.testing.assert_close(outputs.logits_per_image, expected_logits, rtol=1e-3, atol=1e-3) diff --git a/docs/transformers/tests/models/groupvit/test_modeling_tf_groupvit.py b/docs/transformers/tests/models/groupvit/test_modeling_tf_groupvit.py new file mode 100644 index 0000000000000000000000000000000000000000..24ffc88a82204937053672670e37678dd067bc5f --- /dev/null +++ b/docs/transformers/tests/models/groupvit/test_modeling_tf_groupvit.py @@ -0,0 +1,695 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the TensorFlow GroupViT model.""" + +from __future__ import annotations + +import inspect +import os +import random +import tempfile +import unittest +from importlib import import_module + +import requests + +from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig +from transformers.testing_utils import ( + require_tensorflow_probability, + require_tf, + require_vision, + slow, +) +from transformers.utils import is_tf_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFGroupViTModel, TFGroupViTTextModel, TFGroupViTVisionModel, TFSharedEmbeddings + from transformers.modeling_tf_utils import keras + + +if is_vision_available(): + from PIL import Image + + from transformers import CLIPProcessor + + +class TFGroupViTVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + depths=[6, 3, 3], + num_group_tokens=[64, 8, 0], + num_output_groups=[64, 8, 8], + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.depths = depths + self.num_hidden_layers = sum(depths) + self.expected_num_hidden_layers = len(depths) + 1 + self.num_group_tokens = num_group_tokens + self.num_output_groups = num_output_groups + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + num_patches = (image_size // patch_size) ** 2 + # no [CLS] token for GroupViT + self.seq_length = num_patches + + def prepare_config_and_inputs(self): + rng = random.Random(0) + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size], rng=rng) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return GroupViTVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + depths=self.depths, + num_group_tokens=self.num_group_tokens, + num_output_groups=self.num_output_groups, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = TFGroupViTVisionModel(config=config) + result = model(pixel_values, training=False) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.num_output_groups[-1], self.hidden_size) + ) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_tf 
+class TFGroupViTVisionModelTest(TFModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as GroupViT does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (TFGroupViTVisionModel,) if is_tf_available() else () + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFGroupViTVisionModelTester(self) + self.config_tester = ConfigTester( + self, config_class=GroupViTVisionConfig, has_text_modality=False, hidden_size=37 + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="GroupViT does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + """ + During saving, TensorFlow will also run with `training=True` which trigger `gumbel_softmax` that requires + `tensorflow-probability`. + """ + + @require_tensorflow_probability + @slow + def test_saved_model_creation(self): + super().test_saved_model_creation() + + @unittest.skip(reason="GroupViT does not use inputs_embeds") + def test_graph_mode_with_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, keras.layers.Layer)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + + expected_num_attention_outputs = sum(g > 0 for g in self.model_tester.num_group_tokens) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) + attentions = outputs.attentions + # GroupViT returns attention grouping of each stage + self.assertEqual(len(attentions), sum(g > 0 for g in self.model_tester.num_group_tokens)) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) + attentions = outputs.attentions + # GroupViT returns attention grouping of each stage + self.assertEqual(len(attentions), expected_num_attention_outputs) + + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + outputs = 
model(**self._prepare_for_class(inputs_dict, model_class), training=False) + + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + # GroupViT returns attention grouping of each stage + self.assertEqual(len(self_attentions), expected_num_attention_outputs) + for i, self_attn in enumerate(self_attentions): + if self_attn is None: + continue + + self.assertListEqual( + list(self_attentions[i].shape[-2:]), + [ + self.model_tester.num_output_groups[i], + self.model_tester.num_output_groups[i - 1] if i > 0 else seq_len, + ], + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + seq_length = getattr(self.model_tester, "seq_length", None) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + @slow + def test_model_from_pretrained(self): + model_name = "nvidia/groupvit-gcc-yfcc" + model = TFGroupViTVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @unittest.skip( + "TFGroupViTVisionModel does not convert `hidden_states` and `attentions` to tensors as they are all of" + " different dimensions, and we get `Got a non-Tensor value` error when saving the model." 
+ ) + @slow + def test_saved_model_creation_extended(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + if hasattr(config, "use_cache"): + config.use_cache = True + + seq_len = getattr(self.model_tester, "seq_length", None) + + for model_class in self.all_model_classes: + class_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + num_out = len(model(class_inputs_dict)) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, saved_model=True) + saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") + model = keras.models.load_model(saved_model_dir) + outputs = model(class_inputs_dict) + output_hidden_states = outputs["hidden_states"] + output_attentions = outputs["attentions"] + + # Check num outputs + self.assertEqual(len(outputs), num_out) + + # Check num layers + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + self.assertEqual(len(output_hidden_states), expected_num_layers) + self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers) + + # Check attention outputs + image_size = (self.model_tester.image_size, self.model_tester.image_size) + patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_len = num_patches + 1 + + self.assertListEqual( + list(output_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + + # Check hidden states + self.assertListEqual( + list(output_hidden_states[0].shape[-2:]), + [seq_len, self.model_tester.hidden_size], + ) + + +class TFGroupViTTextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + rng = random.Random(0) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, rng=rng) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + # make sure the first token has attention mask `1` to ensure that, after combining the causal mask, there + # is still at least one token being attended to for each batch. + # TODO: Change `random_attention_mask` in PT/TF/Flax common test file, after a discussion with the team. 
+ input_mask = tf.concat( + [tf.ones_like(input_mask[:, :1], dtype=input_mask.dtype), input_mask[:, 1:]], axis=-1 + ) + + config = self.get_config() + + return config, input_ids, input_mask + + def get_config(self): + return GroupViTTextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, input_ids, input_mask): + model = TFGroupViTTextModel(config=config) + result = model(input_ids, attention_mask=input_mask, training=False) + result = model(input_ids, training=False) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tf +class TFGroupViTTextModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = (TFGroupViTTextModel,) if is_tf_available() else () + test_pruning = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFGroupViTTextModelTester(self) + self.config_tester = ConfigTester(self, config_class=GroupViTTextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="GroupViTTextModel does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "nvidia/groupvit-gcc-yfcc" + model = TFGroupViTTextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + def test_saved_model_creation_extended(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + if hasattr(config, "use_cache"): + config.use_cache = True + + for model_class in self.all_model_classes: + class_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + model = model_class(config) + num_out = len(model(class_inputs_dict)) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, saved_model=True) + saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") + model = keras.models.load_model(saved_model_dir) + outputs = model(class_inputs_dict) + output_hidden_states = outputs["hidden_states"] + output_attentions = outputs["attentions"] + + # Check number of outputs + self.assertEqual(len(outputs), num_out) + + # Check number of layers + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + # Check hidden states + self.assertEqual(len(output_hidden_states), expected_num_layers) + self.assertListEqual( + list(output_hidden_states[0].shape[-2:]), + [self.model_tester.seq_length, self.model_tester.hidden_size], + ) + + # Check attention outputs + 
self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers) + + seq_length = self.model_tester.seq_length + key_length = getattr(self.model_tester, "key_length", seq_length) + + self.assertListEqual( + list(output_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length, key_length], + ) + + +class TFGroupViTModelTester: + def __init__(self, parent, is_training=True): + self.parent = parent + self.text_model_tester = TFGroupViTTextModelTester(parent) + self.vision_model_tester = TFGroupViTVisionModelTester(parent) + self.is_training = is_training + + def prepare_config_and_inputs(self): + text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask, pixel_values + + def get_config(self): + return GroupViTConfig.from_text_vision_configs( + self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + ) + + def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): + model = TFGroupViTModel(config) + result = model(input_ids, pixel_values, attention_mask, training=False) + self.parent.assertEqual( + result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "return_loss": True, + } + return config, inputs_dict + + +@require_tf +class TFGroupViTModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (TFGroupViTModel,) if is_tf_available() else () + pipeline_model_mapping = {"feature-extraction": TFGroupViTModel} if is_tf_available() else {} + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + test_onnx = False + + def setUp(self): + self.model_tester = TFGroupViTModelTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="hidden_states are tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="input_embeds are tested in individual model tests") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="CLIPModel does not have input/output embeddings") + def test_model_common_attributes(self): + pass + + @require_tensorflow_probability + @slow + def test_keras_fit(self): + super().test_keras_fit() + + # overwrite from common since `TFGroupViTModelTester` set `return_loss` to `True` and causes the preparation of + # `symbolic_inputs` failed. 
+ def test_keras_save_load(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # remove `return_loss` to make code work + if self.__class__.__name__ == "TFGroupViTModelTest": + inputs_dict.pop("return_loss", None) + + tf_main_layer_classes = { + module_member + for model_class in self.all_model_classes + for module in (import_module(model_class.__module__),) + for module_member_name in dir(module) + if module_member_name.endswith("MainLayer") + # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`. + and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] + for module_member in (getattr(module, module_member_name),) + if isinstance(module_member, type) + and keras.layers.Layer in module_member.__bases__ + and getattr(module_member, "_keras_serializable", False) + } + for main_layer_class in tf_main_layer_classes: + # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter + if "T5" in main_layer_class.__name__: + # Take the same values than in TFT5ModelTester for this shared layer + shared = TFSharedEmbeddings(99, 32, name="shared") + config.use_cache = inputs_dict.pop("use_cache", None) + main_layer = main_layer_class(config, embed_tokens=shared) + else: + main_layer = main_layer_class(config) + + symbolic_inputs = { + name: keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() + } + + model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) + outputs = model(inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + filepath = os.path.join(tmpdirname, "keras_model.h5") + model.save(filepath) + if "T5" in main_layer_class.__name__: + model = keras.models.load_model( + filepath, + custom_objects={ + main_layer_class.__name__: main_layer_class, + "TFSharedEmbeddings": TFSharedEmbeddings, + }, + ) + else: + model = keras.models.load_model( + filepath, custom_objects={main_layer_class.__name__: main_layer_class} + ) + assert isinstance(model, keras.Model) + after_outputs = model(inputs_dict) + self.assert_outputs_same(after_outputs, outputs) + + @slow + def test_model_from_pretrained(self): + model_name = "nvidia/groupvit-gcc-yfcc" + model = TFGroupViTModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @unittest.skip(reason="Currently `saved_model` doesn't work with nested outputs.") + @slow + def test_saved_model_creation(self): + pass + + @unittest.skip(reason="`saved_model` doesn't work with nested outputs so no preparation happens.") + @slow + def test_prepare_serving_output(self): + pass + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@require_vision +@require_tf +class TFGroupViTModelIntegrationTest(unittest.TestCase): + @slow + def test_inference(self): + model_name = "nvidia/groupvit-gcc-yfcc" + model = TFGroupViTModel.from_pretrained(model_name) + processor = CLIPProcessor.from_pretrained(model_name) + + image = prepare_img() + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="tf" + ) + + outputs = model(**inputs, training=False) + + # verify the logits + self.assertEqual( + outputs.logits_per_image.shape, + tf.TensorShape((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + ) + self.assertEqual( + 
outputs.logits_per_text.shape, + tf.TensorShape((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), + ) + + expected_logits = tf.constant([[13.3523, 6.3629]]) + + tf.debugging.assert_near(outputs.logits_per_image, expected_logits, atol=1e-3) diff --git a/docs/transformers/tests/models/helium/__init__.py b/docs/transformers/tests/models/helium/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/helium/test_modeling_helium.py b/docs/transformers/tests/models/helium/test_modeling_helium.py new file mode 100644 index 0000000000000000000000000000000000000000..f4a555588e93956c157347bc8b31415192b9a55c --- /dev/null +++ b/docs/transformers/tests/models/helium/test_modeling_helium.py @@ -0,0 +1,108 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Helium model.""" + +import unittest + +from transformers import AutoModelForCausalLM, AutoTokenizer, HeliumConfig, is_torch_available +from transformers.testing_utils import ( + require_read_token, + require_torch, + slow, + torch_device, +) + +from ...test_configuration_common import ConfigTester +from ..gemma.test_modeling_gemma import GemmaModelTest, GemmaModelTester + + +if is_torch_available(): + import torch + + from transformers import ( + HeliumForCausalLM, + HeliumForSequenceClassification, + HeliumForTokenClassification, + HeliumModel, + ) + + +class HeliumModelTester(GemmaModelTester): + if is_torch_available(): + config_class = HeliumConfig + model_class = HeliumModel + for_causal_lm_class = HeliumForCausalLM + for_sequence_class = HeliumForSequenceClassification + for_token_class = HeliumForTokenClassification + + +@require_torch +class HeliumModelTest(GemmaModelTest, unittest.TestCase): + all_model_classes = ( + (HeliumModel, HeliumForCausalLM, HeliumForSequenceClassification, HeliumForTokenClassification) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": HeliumModel, + "text-classification": HeliumForSequenceClassification, + "token-classification": HeliumForTokenClassification, + "text-generation": HeliumForCausalLM, + "zero-shot": HeliumForSequenceClassification, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + _is_stateful = True + model_split_percents = [0.5, 0.6] + + def setUp(self): + self.model_tester = HeliumModelTester(self) + self.config_tester = ConfigTester(self, config_class=HeliumConfig, hidden_size=37) + + +@slow +# @require_torch_gpu +class HeliumIntegrationTest(unittest.TestCase): + input_text = ["Hello, today is a great day to"] + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and 
torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + @require_read_token + def test_model_2b(self): + model_id = "kyutai/helium-1-preview" + EXPECTED_TEXTS = [ + "Hello, today is a great day to start a new project. I have been working on a new project for a while now and I have" + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, revision="refs/pr/1" + ).to(torch_device) + tokenizer = AutoTokenizer.from_pretrained(model_id, revision="refs/pr/1") + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) diff --git a/docs/transformers/tests/models/herbert/__init__.py b/docs/transformers/tests/models/herbert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/herbert/test_tokenization_herbert.py b/docs/transformers/tests/models/herbert/test_tokenization_herbert.py new file mode 100644 index 0000000000000000000000000000000000000000..6bd95000d6200f2d5172f14172d495a51ae4a44a --- /dev/null +++ b/docs/transformers/tests/models/herbert/test_tokenization_herbert.py @@ -0,0 +1,141 @@ +# Copyright 2018 The Google AI Language Team Authors, Allegro.pl and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+import json
+import os
+import unittest
+
+from transformers import HerbertTokenizer, HerbertTokenizerFast
+from transformers.models.herbert.tokenization_herbert import VOCAB_FILES_NAMES
+from transformers.testing_utils import get_tests_dir, require_sacremoses, require_tokenizers, slow
+
+from ...test_tokenization_common import TokenizerTesterMixin
+
+
+@require_sacremoses
+@require_tokenizers
+class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "allegro/herbert-base-cased"
+    tokenizer_class = HerbertTokenizer
+    rust_tokenizer_class = HerbertTokenizerFast
+    test_rust_tokenizer = True
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+
+        # Use a simpler test file without japanese/chinese characters
+        with open(f"{get_tests_dir()}/fixtures/sample_text_no_unicode.txt", encoding="utf-8") as f_data:
+            cls._data = f_data.read().replace("\n\n", "\n").strip()
+
+        vocab = [
+            "<s>",
+            "</s>",
+            "l",
+            "o",
+            "w",
+            "e",
+            "r",
+            "s",
+            "t",
+            "i",
+            "d",
+            "n",
+            "w</w>",
+            "r</w>",
+            "t</w>",
+            "lo",
+            "low",
+            "er</w>",
+            "low</w>",
+            "lowest</w>",
+            "newer</w>",
+            "wider</w>",
+            ",</w>",
+            "<unk>",
+        ]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
+
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w") as fp:
+            fp.write(json.dumps(vocab_tokens))
+        with open(cls.merges_file, "w") as fp:
+            fp.write("\n".join(merges))
+
+    def get_input_output_texts(self, tokenizer):
+        input_text = "lower newer"
+        output_text = "lower newer"
+        return input_text, output_text
+
+    def test_full_tokenizer(self):
+        tokenizer = self.tokenizer_class(vocab_file=self.vocab_file, merges_file=self.merges_file)
+
+        text = "lower"
+        bpe_tokens = ["low", "er</w>"]
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, bpe_tokens)
+
+        input_tokens = tokens + ["<unk>"]
+        input_bpe_tokens = [16, 17, 23]
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+
+    def test_rust_and_python_full_tokenizers(self):
+        if not self.test_rust_tokenizer:
+            self.skipTest(reason="test_rust_tokenizer is set to False")
+
+        tokenizer = self.get_tokenizer()
+        rust_tokenizer = self.get_rust_tokenizer()
+
+        sequence = "lower,newer"
+
+        tokens = tokenizer.tokenize(sequence)
+        rust_tokens = rust_tokenizer.tokenize(sequence)
+        self.assertListEqual(tokens, rust_tokens)
+
+        ids = tokenizer.encode(sequence, add_special_tokens=False)
+        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
+        self.assertListEqual(ids, rust_ids)
+
+        rust_tokenizer = self.get_rust_tokenizer()
+        ids = tokenizer.encode(sequence)
+        rust_ids = rust_tokenizer.encode(sequence)
+        self.assertListEqual(ids, rust_ids)
+
+    @slow
+    def test_sequence_builders(self):
+        tokenizer = self.tokenizer_class.from_pretrained("allegro/herbert-base-cased")
+
+        text = tokenizer.encode("konstruowanie sekwencji", add_special_tokens=False)
+        text_2 = tokenizer.encode("konstruowanie wielu sekwencji", add_special_tokens=False)
+
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+
+        assert encoded_sentence == [0] + text + [2]
+        assert encoded_pair == [0] + text + [2] + text_2 + [2]
+
+    @unittest.skip(
+        "Test passes if run individually but not with the full tests (internal state of the tokenizer is modified).
Will fix later" + ) + def test_training_new_tokenizer_with_special_tokens_change(self): + pass + + @unittest.skip( + "Test passes if run individually but not with the full tests (internal state of the tokenizer is modified). Will fix later" + ) + def test_training_new_tokenizer(self): + pass diff --git a/docs/transformers/tests/models/hiera/__init__.py b/docs/transformers/tests/models/hiera/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/hiera/test_modeling_hiera.py b/docs/transformers/tests/models/hiera/test_modeling_hiera.py new file mode 100644 index 0000000000000000000000000000000000000000..7874c78d82955fdbe0f1a719e31b306735f59a4d --- /dev/null +++ b/docs/transformers/tests/models/hiera/test_modeling_hiera.py @@ -0,0 +1,630 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Hiera model.""" + +import math +import unittest + +from transformers import HieraConfig +from transformers.testing_utils import ( + require_torch, + require_vision, + slow, + torch_device, +) +from transformers.utils import ( + cached_property, + is_torch_available, + is_vision_available, +) + +from ...test_backbone_common import BackboneTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import HieraBackbone, HieraForImageClassification, HieraForPreTraining, HieraModel + +if is_vision_available(): + from PIL import Image + + from transformers import AutoImageProcessor + + +class HieraModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=[64, 64], + mlp_ratio=1.0, + num_channels=3, + depths=[1, 1, 1, 1], + patch_stride=[4, 4], + patch_size=[7, 7], + patch_padding=[3, 3], + masked_unit_size=[8, 8], + num_heads=[1, 1, 1, 1], + embed_dim_multiplier=2.0, + is_training=True, + use_labels=True, + embed_dim=8, + hidden_act="gelu", + decoder_hidden_size=2, + decoder_depth=1, + decoder_num_heads=1, + initializer_range=0.02, + scope=None, + type_sequence_label_size=10, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.mlp_ratio = mlp_ratio + self.num_channels = num_channels + self.depths = depths + self.patch_stride = patch_stride + self.patch_size = patch_size + self.patch_padding = patch_padding + self.masked_unit_size = masked_unit_size + self.num_heads = num_heads + self.embed_dim_multiplier = embed_dim_multiplier + self.is_training = is_training + self.use_labels = use_labels + self.embed_dim = embed_dim + self.hidden_act = hidden_act + self.decoder_hidden_size = decoder_hidden_size + self.decoder_depth = decoder_depth + self.decoder_num_heads = decoder_num_heads + self.initializer_range = 
initializer_range + self.scope = scope + self.type_sequence_label_size = type_sequence_label_size + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size[0], self.image_size[1]]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return HieraConfig( + embed_dim=self.embed_dim, + image_size=self.image_size, + patch_stride=self.patch_stride, + patch_size=self.patch_size, + patch_padding=self.patch_padding, + masked_unit_size=self.masked_unit_size, + mlp_ratio=self.mlp_ratio, + num_channels=self.num_channels, + depths=self.depths, + num_heads=self.num_heads, + embed_dim_multiplier=self.embed_dim_multiplier, + hidden_act=self.hidden_act, + decoder_hidden_size=self.decoder_hidden_size, + decoder_depth=self.decoder_depth, + decoder_num_heads=self.decoder_num_heads, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = HieraModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + tokens_spatial_shape = [i // s for i, s in zip(self.image_size, config.patch_stride)] + expected_seq_len = math.prod(tokens_spatial_shape) // math.prod(config.query_stride) ** (config.num_query_pool) + expected_dim = int(config.embed_dim * config.embed_dim_multiplier ** (len(config.depths) - 1)) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, expected_dim)) + + def create_and_check_backbone(self, config, pixel_values, labels): + model = HieraBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify hidden states + self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) + num_patches = config.image_size[0] // config.patch_stride[0] // config.masked_unit_size[0] + self.parent.assertListEqual( + list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], num_patches, num_patches] + ) + + # verify channels + self.parent.assertEqual(len(model.channels), len(config.out_features)) + + # verify backbone works with out_features=None + config.out_features = None + model = HieraBackbone(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + + # verify feature maps + self.parent.assertEqual(len(result.feature_maps), 1) + self.parent.assertListEqual( + list(result.feature_maps[0].shape), [self.batch_size, model.channels[-1], num_patches, num_patches] + ) + + # verify channels + self.parent.assertEqual(len(model.channels), 1) + + def create_and_check_for_pretraining(self, config, pixel_values, labels): + model = HieraForPreTraining(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + pred_stride = config.patch_stride[-1] * (config.query_stride[-1] ** config.num_query_pool) + num_patches = self.image_size[0] // pred_stride + self.parent.assertEqual( + result.logits.shape, (self.batch_size, num_patches**2, self.num_channels * pred_stride**2) + ) + + # test greyscale images + config.num_channels = 1 + model = HieraForPreTraining(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size[0], self.image_size[0]]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, num_patches**2, pred_stride**2)) + + def 
create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = HieraForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = HieraForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size[0], self.image_size[0]]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class HieraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as Hiera does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = ( + ( + HieraModel, + HieraBackbone, + HieraForImageClassification, + HieraForPreTraining, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + {"image-feature-extraction": HieraModel, "image-classification": HieraForImageClassification} + if is_torch_available() + else {} + ) + fx_compatible = True + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = HieraModelTester(self) + self.config_tester = ConfigTester(self, config_class=HieraConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + self.config_tester.check_config_arguments_init() + + # Overriding as Hiera `get_input_embeddings` returns HieraPatchEmbeddings + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + # Overriding as attention shape depends on patch_stride and mask_unit_size + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + expected_num_attentions = len(self.model_tester.depths) + self.assertEqual(len(attentions), expected_num_attentions) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + seq_len = math.prod([i // s for i, 
s in zip(config.image_size, config.patch_stride)]) + mask_unit_area = math.prod(config.masked_unit_size) + num_windows = seq_len // mask_unit_area + if model_class.__name__ == "HieraForPreTraining": + num_windows = int(num_windows * (1 - config.mask_ratio)) + seq_len = int(num_windows * mask_unit_area) + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), expected_num_attentions) + + self.assertListEqual( + list(attentions[0].shape[-4:]), + [self.model_tester.num_heads[0], num_windows, mask_unit_area, seq_len // num_windows], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + # also another +1 for reshaped_hidden_states + added_hidden_states = 1 if model_class.__name__ == "HieraBackbone" else 2 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), expected_num_attentions) + + self.assertListEqual( + list(self_attentions[0].shape[-4:]), + [self.model_tester.num_heads[0], num_windows, mask_unit_area, seq_len // num_windows], + ) + + # Overriding as attention shape depends on patch_stride and mask_unit_size + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class, image_size): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # Hiera has a different seq_length + patch_size = config.patch_stride + + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + if model_class.__name__ == "HieraForPreTraining": + mask_unit_area = math.prod(config.masked_unit_size) + num_windows = num_patches // mask_unit_area + num_windows = int(num_windows * (1 - config.mask_ratio)) + num_patches = int(num_windows * mask_unit_area) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [num_patches, self.model_tester.embed_dim], + ) + + if not model_class.__name__ == "HieraBackbone": + reshaped_hidden_states = outputs.reshaped_hidden_states + self.assertEqual(len(reshaped_hidden_states), expected_num_layers) + + batch_size = reshaped_hidden_states[0].shape[0] + num_channels = reshaped_hidden_states[0].shape[-1] + + reshaped_hidden_states = reshaped_hidden_states[0].view(batch_size, -1, num_channels) + self.assertListEqual( + list(reshaped_hidden_states.shape[-2:]), + [num_patches, self.model_tester.embed_dim], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + image_size = self.model_tester.image_size + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class, image_size) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = 
True + + check_hidden_states_output(inputs_dict, config, model_class, image_size) + + # Overriding since HieraForPreTraining outputs bool_masked_pos which has to be converted to float in the msg + def test_model_outputs_equivalence(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def set_nan_tensor_to_zero(t): + t[t != t] = 0 + return t + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + with torch.no_grad(): + tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (list, tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + torch.allclose( + set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 + ), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {torch.max(torch.abs(tuple_object.float() - dict_object.float()))}. Tuple has `nan`:" + f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has" + f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}." + ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + additional_kwargs = {} + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + additional_kwargs["output_hidden_states"] = True + check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs) + + if self.has_attentions: + # Removing "output_hidden_states" + del additional_kwargs["output_hidden_states"] + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + additional_kwargs["output_attentions"] = True + check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + dict_inputs = self._prepare_for_class(inputs_dict, model_class, 
return_labels=True) + additional_kwargs["output_hidden_states"] = True + check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs) + + @unittest.skip(reason="Hiera Transformer does not use feedforward chunking") + def test_feed_forward_chunking(self): + pass + + @unittest.skip(reason="Hiera does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_backbone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_backbone(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_pretraining(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in ["facebook/hiera-tiny-224-hf"]: + model = HieraModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class HieraModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return AutoImageProcessor.from_pretrained("facebook/hiera-tiny-224-in1k-hf") if is_vision_available() else None + + @slow + def test_inference_image_classification_head(self): + model = HieraForImageClassification.from_pretrained("facebook/hiera-tiny-224-in1k-hf").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + expected_pixel_values = torch.tensor( + [ + [[0.2967, 0.4679, 0.4508], [0.3309, 0.4337, 0.3309], [0.3309, 0.3823, 0.3309]], + [[-1.5455, -1.4930, -1.5455], [-1.5280, -1.4755, -1.5980], [-1.5630, -1.5280, -1.4755]], + [[-0.6367, -0.4973, -0.5321], [-0.7936, -0.6715, -0.6715], [-0.8284, -0.7413, -0.5670]], + ] + ).to(torch_device) + + torch.testing.assert_close(inputs.pixel_values[0, :3, :3, :3], expected_pixel_values, rtol=1e-4, atol=1e-4) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([[0.8028, 0.2409, -0.2254, -0.3712, -0.2848]]).to(torch_device) + + torch.testing.assert_close(outputs.logits[0, :5], expected_slice, rtol=1e-4, atol=1e-4) + + def test_inference_interpolate_pos_encoding(self): + model = HieraModel.from_pretrained("facebook/hiera-tiny-224-hf").to(torch_device) + + image_processor = AutoImageProcessor.from_pretrained( + "facebook/hiera-tiny-224-hf", size={"shortest_edge": 448}, crop_size={"height": 448, "width": 448} + ) + image = prepare_img() + inputs = image_processor(images=image, 
return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(pixel_values, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 196, 768)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[1.7853, 0.0690, 0.3177], [2.6853, -0.2334, 0.0889], [1.5445, -0.1515, -0.0300]] + ).to(torch_device) + + torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + @slow + def test_inference_for_pretraining(self): + # make random mask reproducible + torch.manual_seed(2) + + model = HieraForPreTraining.from_pretrained("facebook/hiera-tiny-224-mae-hf").to(torch_device) + image_processor = self.default_image_processor + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + config = model.config + mask_spatial_shape = [ + i // s // ms for i, s, ms in zip(config.image_size, config.patch_stride, config.masked_unit_size) + ] + num_windows = math.prod(mask_spatial_shape) + noise = torch.rand(1, num_windows).to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, noise=noise) + + # verify the logits + expected_shape = torch.Size((1, 196, 768)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor( + [ + [1.6407, 1.6506, 1.6541, 1.6617, 1.6703], + [1.9730, 1.9842, 1.9848, 1.9896, 1.9947], + [1.5949, 1.8262, 1.2602, 1.4801, 1.4448], + [1.2341, 1.7907, 0.8618, 1.5202, 1.4523], + [2.0140, 1.9846, 1.9434, 1.9019, 1.8648], + ] + ) + + torch.testing.assert_close(outputs.logits[0, :5, :5], expected_slice.to(torch_device), rtol=1e-4, atol=1e-4) + + +@require_torch +class HieraBackboneTest(unittest.TestCase, BackboneTesterMixin): + all_model_classes = (HieraBackbone,) if is_torch_available() else () + config_class = HieraConfig + + def setUp(self): + self.model_tester = HieraModelTester(self) diff --git a/docs/transformers/tests/models/hubert/__init__.py b/docs/transformers/tests/models/hubert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/hubert/test_modeling_hubert.py b/docs/transformers/tests/models/hubert/test_modeling_hubert.py new file mode 100644 index 0000000000000000000000000000000000000000..c4e65cfa5c6b8282559772a20fc1a62e78917048 --- /dev/null +++ b/docs/transformers/tests/models/hubert/test_modeling_hubert.py @@ -0,0 +1,979 @@ +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Hubert model.""" + +import math +import os +import pickle +import tempfile +import unittest + +import pytest + +from transformers import HubertConfig, is_torch_available +from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, + random_attention_mask, +) +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + HubertForCTC, + HubertForSequenceClassification, + HubertModel, + Wav2Vec2FeatureExtractor, + Wav2Vec2Processor, + ) + from transformers.models.hubert.modeling_hubert import _compute_mask_indices + +from transformers.utils.fx import symbolic_trace + + +class HubertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=1024, # speech is longer + is_training=False, + hidden_size=16, + feat_extract_norm="group", + feat_extract_dropout=0.0, + feat_extract_activation="gelu", + conv_dim=(32, 32, 32), + conv_stride=(4, 4, 4), + conv_kernel=(8, 8, 8), + conv_bias=False, + num_conv_pos_embeddings=16, + num_conv_pos_embedding_groups=2, + num_hidden_layers=2, + num_attention_heads=2, + hidden_dropout_prob=0.1, # this is most likely not correctly set yet + intermediate_size=20, + layer_norm_eps=1e-5, + hidden_act="gelu", + initializer_range=0.02, + vocab_size=32, + do_stable_layer_norm=False, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_dropout = feat_extract_dropout + self.feat_extract_activation = feat_extract_activation + self.conv_dim = conv_dim + self.conv_stride = conv_stride + self.conv_kernel = conv_kernel + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.intermediate_size = intermediate_size + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.do_stable_layer_norm = do_stable_layer_norm + self.scope = scope + + output_seq_length = self.seq_length + for kernel, stride in zip(self.conv_kernel, self.conv_stride): + output_seq_length = (output_seq_length - (kernel - 1)) / stride + self.output_seq_length = int(math.ceil(output_seq_length)) + self.encoder_seq_length = self.output_seq_length + + def prepare_config_and_inputs(self): + input_values = floats_tensor([self.batch_size, self.seq_length], scale=1.0) + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = self.get_config() + + return config, input_values, attention_mask + + def get_config(self): + return HubertConfig( + hidden_size=self.hidden_size, + feat_extract_norm=self.feat_extract_norm, + feat_extract_dropout=self.feat_extract_dropout, + feat_extract_activation=self.feat_extract_activation, + conv_dim=self.conv_dim, + conv_stride=self.conv_stride, + conv_kernel=self.conv_kernel, + conv_bias=self.conv_bias, + num_conv_pos_embeddings=self.num_conv_pos_embeddings, + num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, + 
num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_dropout_prob=self.hidden_dropout_prob, + intermediate_size=self.intermediate_size, + layer_norm_eps=self.layer_norm_eps, + hidden_act=self.hidden_act, + initializer_range=self.initializer_range, + vocab_size=self.vocab_size, + do_stable_layer_norm=self.do_stable_layer_norm, + ) + + def create_and_check_model(self, config, input_values, attention_mask): + model = HubertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) + ) + + def create_and_check_batch_inference(self, config, input_values, *args): + # test does not pass for models making use of `group_norm` + # check: https://github.com/pytorch/fairseq/issues/3227 + model = HubertModel(config=config) + model.to(torch_device) + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.bool) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0.0 + + batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state + + for i in range(input_values.shape[0]): + input_slice = input_values[i : i + 1, : input_lengths[i]] + output = model(input_slice).last_hidden_state + + batch_output = batch_outputs[i : i + 1, : output.shape[1]] + self.parent.assertTrue(torch.allclose(output, batch_output, atol=1e-3)) + + def check_ctc_loss(self, config, input_values, *args): + model = HubertForCTC(config=config) + model.to(torch_device) + + # make sure that dropout is disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + model.config.ctc_loss_reduction = "sum" + sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + + model.config.ctc_loss_reduction = "mean" + mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + + self.parent.assertTrue(isinstance(sum_loss, float)) + self.parent.assertTrue(isinstance(mean_loss, float)) + + def check_seq_classifier_loss(self, config, input_values, *args): + model = HubertForSequenceClassification(config=config) + model.to(torch_device) + + # make sure that dropout is disabled + model.eval() + + input_values = input_values[:3] + attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long) + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + attention_mask[i, input_lengths[i] :] = 0 + + masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() + unmasked_loss = model(input_values, 
labels=labels).loss.item() + + self.parent.assertTrue(isinstance(masked_loss, float)) + self.parent.assertTrue(isinstance(unmasked_loss, float)) + self.parent.assertTrue(masked_loss != unmasked_loss) + + def check_ctc_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = HubertForCTC(config=config) + model.to(torch_device) + model.train() + + # freeze feature encoder + model.freeze_feature_encoder() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + if max_length_labels[i] < labels.shape[-1]: + # it's important that we make sure that target lengths are at least + # one shorter than logit lengths to prevent -inf + labels[i, max_length_labels[i] - 1 :] = -100 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_seq_classifier_training(self, config, input_values, *args): + config.ctc_zero_infinity = True + model = HubertForSequenceClassification(config=config) + model.to(torch_device) + model.train() + + # freeze everything but the classification head + model.freeze_base_model() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) + + # pad input + for i in range(len(input_lengths)): + input_values[i, input_lengths[i] :] = 0.0 + + loss = model(input_values, labels=labels).loss + self.parent.assertFalse(torch.isinf(loss).item()) + + loss.backward() + + def check_labels_out_of_vocab(self, config, input_values, *args): + model = HubertForCTC(config) + model.to(torch_device) + model.train() + + input_values = input_values[:3] + + input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] + max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths)) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100) + + with pytest.raises(ValueError): + model(input_values, labels=labels) + + def prepare_config_and_inputs_for_common(self): + config, input_values, attention_mask = self.prepare_config_and_inputs() + inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_torch +class HubertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (HubertForCTC, HubertForSequenceClassification, HubertModel) if is_torch_available() else () + pipeline_model_mapping = ( + { + "audio-classification": HubertForSequenceClassification, + "automatic-speech-recognition": HubertForCTC, + "feature-extraction": HubertModel, + } + if is_torch_available() + else {} + ) + fx_compatible = True + test_pruning = False + test_headmasking = False + + def setUp(self): + self.model_tester = HubertModelTester(self) + self.config_tester = ConfigTester(self, config_class=HubertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_ctc_loss_inference(self): + 
config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_seq_classifier_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_loss(*config_and_inputs) + + def test_ctc_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_training(*config_and_inputs) + + def test_seq_classifier_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_training(*config_and_inputs) + + def test_labels_out_of_vocab(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_labels_out_of_vocab(*config_and_inputs) + + @unittest.skip(reason="Hubert has no inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Hubert has no inputs_embeds") + def test_forward_signature(self): + pass + + # Hubert cannot resize token embeddings + # since it has no tokens embeddings + @unittest.skip(reason="Hubert has no tokens embeddings") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Hubert has no inputs_embeds") + def test_model_get_set_embeddings(self): + pass + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + # set layer drop to 0 + model.config.layerdrop = 0.0 + + input_values = inputs_dict["input_values"] + + input_lengths = torch.tensor( + [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device + ) + output_lengths = model._get_feat_extract_output_lengths(input_lengths) + + labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) + inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) + inputs_dict["labels"] = labels + + outputs = model(**inputs_dict) + + output = outputs[0] + + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = [ + "conv.weight", + "conv.parametrizations.weight", + "masked_spec_embed", + "quantizer.weight_proj.weight", + ] + if param.requires_grad: + if any(x in name for x in uniform_init_parms): + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # Hubert cannot be TorchScripted because of torch.nn.utils.weight_norm + def 
_create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): + # TODO: fix it + self.skipTest(reason="torch 2.1 breaks torch fx tests for wav2vec2/hubert.") + + if not self.fx_compatible: + self.skipTest(reason="torch fx is not compatible with this model") + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.return_dict = False + + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=output_loss) + + try: + if model.config.is_encoder_decoder: + model.config.use_cache = False # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward + labels = inputs.get("labels", None) + input_names = [ + "attention_mask", + "decoder_attention_mask", + "decoder_input_ids", + "input_features", + "input_ids", + "input_values", + ] + if labels is not None: + input_names.append("labels") + + filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} + input_names = list(filtered_inputs.keys()) + + model_output = model(**filtered_inputs) + + traced_model = symbolic_trace(model, input_names) + traced_output = traced_model(**filtered_inputs) + else: + input_names = [ + "attention_mask", + "bbox", + "input_features", + "input_ids", + "input_values", + "pixel_values", + "token_type_ids", + "visual_feats", + "visual_pos", + ] + + labels = inputs.get("labels", None) + start_positions = inputs.get("start_positions", None) + end_positions = inputs.get("end_positions", None) + if labels is not None: + input_names.append("labels") + if start_positions is not None: + input_names.append("start_positions") + if end_positions is not None: + input_names.append("end_positions") + + filtered_inputs = {k: v for (k, v) in inputs.items() if k in input_names} + input_names = list(filtered_inputs.keys()) + + model_output = model(**filtered_inputs) + + traced_model = symbolic_trace(model, input_names) + traced_output = traced_model(**filtered_inputs) + + except Exception as e: + self.fail(f"Couldn't trace module: {e}") + + def flatten_output(output): + flatten = [] + for x in output: + if isinstance(x, (tuple, list)): + flatten += flatten_output(x) + elif not isinstance(x, torch.Tensor): + continue + else: + flatten.append(x) + return flatten + + model_output = flatten_output(model_output) + traced_output = flatten_output(traced_output) + num_outputs = len(model_output) + + for i in range(num_outputs): + self.assertTrue( + torch.allclose(model_output[i], traced_output[i]), + f"traced {i}th output doesn't match model {i}th output for {model_class}", + ) + + # Test that the model can be serialized and restored properly + with tempfile.TemporaryDirectory() as tmp_dir_name: + pkl_file_name = os.path.join(tmp_dir_name, "model.pkl") + try: + with open(pkl_file_name, "wb") as f: + pickle.dump(traced_model, f) + with open(pkl_file_name, "rb") as f: + loaded = pickle.load(f) + except Exception as e: + self.fail(f"Couldn't serialize / deserialize the traced model: {e}") + + loaded_output = loaded(**filtered_inputs) + loaded_output = flatten_output(loaded_output) + + for i in range(num_outputs): + self.assertTrue( + torch.allclose(model_output[i], loaded_output[i]), + f"serialized model {i}th output doesn't match model {i}th output for {model_class}", + ) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight 
is not None: + module.weight.data.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.data.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.data.fill_(3) + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + @slow + def test_model_from_pretrained(self): + model = HubertModel.from_pretrained("facebook/hubert-base-ls960") + self.assertIsNotNone(model) + + +@require_torch +class HubertRobustModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (HubertForCTC, HubertForSequenceClassification, HubertModel) if is_torch_available() else () + test_pruning = False + test_headmasking = False + + def setUp(self): + self.model_tester = HubertModelTester( + self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True + ) + self.config_tester = ConfigTester(self, config_class=HubertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_batched_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_batch_inference(*config_and_inputs) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_seq_classifier_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_loss(*config_and_inputs) + + def test_ctc_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_training(*config_and_inputs) + + def test_seq_classifier_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_seq_classifier_training(*config_and_inputs) + + def test_labels_out_of_vocab(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_labels_out_of_vocab(*config_and_inputs) + + @unittest.skip(reason="Hubert has no inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Hubert has input_values instead of input_ids") + def test_forward_signature(self): + pass + + @unittest.skip(reason="Hubert has no tokens embeddings") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Hubert has no inputs_embeds") + def test_model_get_set_embeddings(self): + pass + + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions = True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + # set layer drop to 0 + model.config.layerdrop = 0.0 + + input_values = inputs_dict["input_values"] + + input_lengths = torch.tensor( + [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=torch.long, device=torch_device + ) + output_lengths = 
model._get_feat_extract_output_lengths(input_lengths) + + labels = ids_tensor((input_values.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) + inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["attention_mask"]) + inputs_dict["labels"] = labels + + outputs = model(**inputs_dict) + + output = outputs[0] + + # Encoder-/Decoder-only models + hidden_states = outputs.hidden_states[0] + attentions = outputs.attentions[0] + + hidden_states.retain_grad() + attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(hidden_states.grad) + self.assertIsNotNone(attentions.grad) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + uniform_init_parms = [ + "conv.weight", + "conv.parametrizations.weight", + "masked_spec_embed", + "quantizer.weight_proj.weight", + ] + if param.requires_grad: + if any(x in name for x in uniform_init_parms): + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite from test_modeling_common + def _mock_init_weights(self, module): + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(3) + if hasattr(module, "weight_g") and module.weight_g is not None: + module.weight_g.data.fill_(3) + if hasattr(module, "weight_v") and module.weight_v is not None: + module.weight_v.data.fill_(3) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(3) + if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: + module.masked_spec_embed.data.fill_(3) + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + @slow + def test_model_from_pretrained(self): + model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft") + self.assertIsNotNone(model) + + +@require_torch +class HubertUtilsTest(unittest.TestCase): + def test_compute_mask_indices(self): + batch_size = 4 + sequence_length = 60 + mask_prob = 0.5 + mask_length = 1 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) + + def test_compute_mask_indices_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal + for batch_sum in mask.sum(axis=-1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + +@require_torch +@require_soundfile +@slow +class HubertModelIntegrationTest(unittest.TestCase): + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # 
automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def _load_superb(self, task, num_samples): + from datasets import load_dataset + + ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True) + + return ds[:num_samples] + + def test_inference_ctc_batched(self): + model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft", torch_dtype=torch.float16).to( + torch_device + ) + processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft", do_lower_case=True) + + input_speech = self._load_datasamples(2) + + inputs = processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.half().to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + + with torch.no_grad(): + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = torch.argmax(logits, dim=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_keyword_spotting(self): + model = HubertForSequenceClassification.from_pretrained( + "superb/hubert-base-superb-ks", torch_dtype=torch.float16 + ).to(torch_device) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-ks") + input_data = self._load_superb("ks", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.half().to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) + + expected_labels = [2, 6, 10, 9] + # s3prl logits for the same batch + expected_logits = torch.tensor([7.6692, 17.7795, 11.1562, 11.8232], dtype=torch.float16, device=torch_device) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + torch.testing.assert_close(predicted_logits, expected_logits, rtol=3e-2, atol=3e-2) + + def test_inference_intent_classification(self): + model = HubertForSequenceClassification.from_pretrained( + "superb/hubert-base-superb-ic", torch_dtype=torch.float16 + ).to(torch_device) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-ic") + input_data = self._load_superb("ic", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.half().to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + + predicted_logits_action, predicted_ids_action = torch.max(outputs.logits[:, :6], dim=-1) + predicted_logits_object, predicted_ids_object = torch.max(outputs.logits[:, 6:20], dim=-1) + predicted_logits_location, predicted_ids_location = torch.max(outputs.logits[:, 20:24], dim=-1) + + expected_labels_action = [1, 0, 4, 3] + expected_logits_action = torch.tensor( + [5.9052, 12.5865, 4.4840, 10.0240], dtype=torch.float16, device=torch_device + ) + expected_labels_object = [1, 10, 3, 4] + expected_logits_object = torch.tensor( + [5.5316, 11.7946, 
8.1672, 23.2415], dtype=torch.float16, device=torch_device + ) + expected_labels_location = [0, 0, 0, 1] + expected_logits_location = torch.tensor( + [5.2053, 8.9577, 10.0447, 8.1481], dtype=torch.float16, device=torch_device + ) + + self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action) + self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object) + self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location) + + # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572 + torch.testing.assert_close(predicted_logits_action, expected_logits_action, rtol=3e-1, atol=3e-1) + torch.testing.assert_close(predicted_logits_object, expected_logits_object, rtol=3e-1, atol=3e-1) + torch.testing.assert_close(predicted_logits_location, expected_logits_location, rtol=3e-1, atol=3e-1) + + def test_inference_speaker_identification(self): + model = HubertForSequenceClassification.from_pretrained( + "superb/hubert-base-superb-sid", torch_dtype=torch.float16 + ).to(torch_device) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-sid") + input_data = self._load_superb("si", 4) + + output_logits = [] + with torch.no_grad(): + for example in input_data["speech"]: + input = processor(example, return_tensors="pt", padding=True) + output = model(input.input_values.half().to(torch_device), attention_mask=None) + output_logits.append(output.logits[0]) + output_logits = torch.stack(output_logits) + predicted_logits, predicted_ids = torch.max(output_logits, dim=-1) + + expected_labels = [5, 1, 1, 3] + # s3prl logits for the same batch + expected_logits = torch.tensor( + [78231.5547, 123166.6094, 122785.4141, 84851.2969], dtype=torch.float16, device=torch_device + ) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572 + torch.testing.assert_close(predicted_logits, expected_logits, rtol=10, atol=10) + + def test_inference_emotion_recognition(self): + model = HubertForSequenceClassification.from_pretrained( + "superb/hubert-base-superb-er", torch_dtype=torch.float16 + ).to(torch_device) + processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-er") + input_data = self._load_superb("er", 4) + inputs = processor(input_data["speech"], return_tensors="pt", padding=True) + + input_values = inputs.input_values.half().to(torch_device) + attention_mask = inputs.attention_mask.to(torch_device) + with torch.no_grad(): + outputs = model(input_values, attention_mask=attention_mask) + predicted_logits, predicted_ids = torch.max(outputs.logits, dim=-1) + + expected_labels = [1, 1, 2, 2] + # s3prl logits for the same batch + expected_logits = torch.tensor([2.8384, 2.3389, 3.8564, 4.5558], dtype=torch.float16, device=torch_device) + + self.assertListEqual(predicted_ids.tolist(), expected_labels) + # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572 + torch.testing.assert_close(predicted_logits, expected_logits, rtol=1e-1, atol=1e-1) + + def test_inference_distilhubert(self): + model = HubertModel.from_pretrained("ntu-spml/distilhubert").to(torch_device) + processor = Wav2Vec2FeatureExtractor.from_pretrained("ntu-spml/distilhubert") + + # TODO: can't test on batched inputs due to incompatible padding https://github.com/pytorch/fairseq/pull/3572 + input_speech = self._load_datasamples(1) + + inputs = 
processor(input_speech, return_tensors="pt", padding=True) + + input_values = inputs.input_values.to(torch_device) + + with torch.no_grad(): + outputs = model(input_values).last_hidden_state + + # expected outputs taken from the original SEW implementation + expected_outputs_first = torch.tensor( + [ + [ + [-0.3505, 0.1167, 0.0608, 0.1294], + [-0.3085, 0.0481, 0.1106, 0.0955], + [-0.3107, -0.0391, 0.0739, 0.1360], + [-0.2385, -0.1795, -0.0928, 0.2389], + ] + ], + device=torch_device, + ) + expected_outputs_last = torch.tensor( + [ + [ + [-0.0732, 0.0255, 0.0529, -0.1372], + [-0.0812, 0.1259, 0.0564, -0.0438], + [-0.0054, 0.0758, -0.0002, -0.1617], + [0.0133, -0.0320, -0.0687, 0.0062], + ] + ], + device=torch_device, + ) + expected_output_sum = -3776.0730 + + torch.testing.assert_close(outputs[:, :4, :4], expected_outputs_first, rtol=5e-3, atol=5e-3) + torch.testing.assert_close(outputs[:, -4:, -4:], expected_outputs_last, rtol=5e-3, atol=5e-3) + self.assertTrue(abs(outputs.sum() - expected_output_sum) < 0.1) + + def test_inference_hubert_25hz(self): + model = HubertModel.from_pretrained("slprl/mhubert-base-25hz").to(torch_device) + + sample = self._load_datasamples(1) + input_speech = torch.tensor(sample[0], dtype=torch.float, device=torch_device).unsqueeze(0) + + with torch.no_grad(): + outputs = model(input_speech, output_hidden_states=True).hidden_states[11] + + # expected outputs taken from the original textlesslib implementation by: + # model = SpeechEncoder.by_name(dense_model_name='mhubert-base-25hz', quantizer_model_name='kmeans', + # vocab_size=500, deduplicate=False, need_f0=False) + # model(wav)['dense'] + expected_outputs_first = torch.tensor( + [ + [0.0267, 0.1776, -0.1706, -0.4559], + [-0.2430, -0.2943, -0.1864, -0.1187], + [-0.1812, -0.4239, -0.1916, -0.0858], + [-0.1495, -0.4758, -0.4036, 0.0302], + ], + device=torch_device, + ) + expected_outputs_last = torch.tensor( + [ + [0.3366, -0.2734, -0.1415, -0.3055], + [0.2329, -0.3580, -0.1421, -0.3197], + [0.1631, -0.4301, -0.1965, -0.2956], + [0.3342, -0.2185, -0.2253, -0.2363], + ], + device=torch_device, + ) + expected_output_sum = 1681.7603 + + torch.testing.assert_close(outputs[:, :4, :4], expected_outputs_first, rtol=5e-3, atol=5e-3) + torch.testing.assert_close(outputs[:, -4:, -4:], expected_outputs_last, rtol=5e-3, atol=5e-3) + self.assertTrue(abs(outputs.sum() - expected_output_sum) < 0.1) diff --git a/docs/transformers/tests/models/hubert/test_modeling_tf_hubert.py b/docs/transformers/tests/models/hubert/test_modeling_tf_hubert.py new file mode 100644 index 0000000000000000000000000000000000000000..8d377ae8855c6e7cfc0eceb1d6ab5a858951f9bf --- /dev/null +++ b/docs/transformers/tests/models/hubert/test_modeling_tf_hubert.py @@ -0,0 +1,562 @@ +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from __future__ import annotations + +import copy +import inspect +import math +import unittest + +import numpy as np +import pytest + +from transformers import is_tf_available +from transformers.testing_utils import require_soundfile, require_tf, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import HubertConfig, TFHubertForCTC, TFHubertModel, Wav2Vec2Processor + from transformers.models.hubert.modeling_tf_hubert import _compute_mask_indices + + +@require_tf +class TFHubertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=1024, + is_training=False, + hidden_size=16, + feat_extract_norm="group", + feat_extract_dropout=0.0, + feat_extract_activation="gelu", + conv_dim=(32, 32, 32), + conv_stride=(4, 4, 4), + conv_kernel=(8, 8, 8), + conv_bias=False, + num_conv_pos_embeddings=16, + num_conv_pos_embedding_groups=2, + num_hidden_layers=2, + num_attention_heads=2, + hidden_dropout_prob=0.1, # this is most likely not correctly set yet + intermediate_size=20, + layer_norm_eps=1e-5, + hidden_act="gelu", + initializer_range=0.02, + vocab_size=32, + do_stable_layer_norm=False, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_dropout = feat_extract_dropout + self.feat_extract_activation = feat_extract_activation + self.conv_dim = conv_dim + self.conv_stride = conv_stride + self.conv_kernel = conv_kernel + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.intermediate_size = intermediate_size + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.do_stable_layer_norm = do_stable_layer_norm + self.scope = scope + + output_seq_length = self.seq_length + for kernel, stride in zip(self.conv_kernel, self.conv_stride): + output_seq_length = (output_seq_length - (kernel - 1)) / stride + self.output_seq_length = int(math.ceil(output_seq_length)) + self.encoder_seq_length = self.output_seq_length + + def prepare_config_and_inputs(self): + input_values = tf.cast(ids_tensor([self.batch_size, self.seq_length], 32768), tf.float32) / 32768.0 + attention_mask = tf.ones_like(input_values) + + config = HubertConfig( + hidden_size=self.hidden_size, + feat_extract_norm=self.feat_extract_norm, + feat_extract_dropout=self.feat_extract_dropout, + feat_extract_activation=self.feat_extract_activation, + conv_dim=self.conv_dim, + conv_stride=self.conv_stride, + conv_kernel=self.conv_kernel, + conv_bias=self.conv_bias, + num_conv_pos_embeddings=self.num_conv_pos_embeddings, + num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_dropout_prob=self.hidden_dropout_prob, + intermediate_size=self.intermediate_size, + layer_norm_eps=self.layer_norm_eps, + hidden_act=self.hidden_act, + initializer_range=self.initializer_range, + 
vocab_size=self.vocab_size, + do_stable_layer_norm=self.do_stable_layer_norm, + ) + + return config, input_values, attention_mask + + def create_and_check_model(self, config, input_values, attention_mask): + model = TFHubertModel(config) + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) + ) + + def create_and_check_batch_inference(self, config, input_values, *args): + # test does not pass for models making use of `group_norm` + # check: https://github.com/pytorch/fairseq/issues/3227 + config.layerdrop = 0.0 + model = TFHubertModel(config) + + input_values = input_values[:3] + attention_mask = tf.ones_like(input_values) + + input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]]) + length_mask = tf.sequence_mask(input_lengths, dtype=tf.float32) + + # convert values that are over input_lengths to padding + input_values = input_values * length_mask + attention_mask = attention_mask * length_mask + + batch_outputs = model(input_values, attention_mask=attention_mask, training=False).last_hidden_state + + for i in range(input_values.shape[0]): + input_slice = input_values[i : i + 1, : input_lengths[i]] + output = model(input_slice, training=False).last_hidden_state + + batch_output = batch_outputs[i : i + 1, : output.shape[1]] + self.parent.assertTrue(np.allclose(output, batch_output, atol=1e-3)) + + def check_ctc_loss(self, config, input_values, *args): + model = TFHubertForCTC(config) + + input_values = input_values[:3] + attention_mask = tf.ones_like(input_values) + + input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]]) + max_length_labels = model.hubert._get_feat_extract_output_lengths(input_lengths) + labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size) + + length_mask = tf.sequence_mask(input_lengths, dtype=tf.float32) + + # convert values that are over input_lengths to padding + input_values = input_values * length_mask + attention_mask = attention_mask * length_mask + + model.config.ctc_loss_reduction = "sum" + sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss + + model.config.ctc_loss_reduction = "mean" + mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss + + self.parent.assertTrue(abs(labels.shape[0] * mean_loss - sum_loss) < 1e-2) + + def check_training(self, config, input_values, *args): + model = TFHubertForCTC(config) + + # freeze feature encoder + model.freeze_feature_encoder() + + input_values = input_values[:3] + + input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]]) + max_length_labels = model.hubert._get_feat_extract_output_lengths(input_lengths) + labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size) + + length_mask = tf.sequence_mask(input_lengths, dtype=tf.float32) + + input_values = input_values * length_mask + + pad_size = max(max_length_labels) - labels.shape[1] + labels = tf.pad(labels, ((0, 0), (0, pad_size)), constant_values=-100) + + loss = model(input_values, labels=labels, training=True).loss + + self.parent.assertFalse(tf.math.is_inf(loss)) + + def check_labels_out_of_vocab(self, config, input_values, *args): + model = TFHubertForCTC(config) + input_lengths = tf.constant([input_values.shape[-1] // i for i in [4, 2, 1]]) + max_length_labels = model.hubert._get_feat_extract_output_lengths(input_lengths) + labels = 
ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size + 100) + with pytest.raises(ValueError): + model(input_values, labels=labels) + + def prepare_config_and_inputs_for_common(self): + config, input_values, attention_mask = self.prepare_config_and_inputs() + inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} + return config, inputs_dict + + +@require_tf +class TFHubertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (TFHubertModel, TFHubertForCTC) if is_tf_available() else () + pipeline_model_mapping = {"feature-extraction": TFHubertModel} if is_tf_available() else {} + test_resize_embeddings = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFHubertModelTester(self) + self.config_tester = ConfigTester(self, config_class=HubertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + # overwrite because input_values != input_ids + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["input_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + # overwrite because input_values != input_ids + def test_keyword_and_dict_args(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs_dict = model(inputs) + + inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + input_values = inputs_keywords.pop("input_values", None) + outputs_keywords = model(input_values, **inputs_keywords) + output_dict = outputs_dict[0].numpy() + output_keywords = outputs_keywords[0].numpy() + + self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_hidden_states_output(config, inputs_dict, model_class): + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + hidden_states = outputs.hidden_states + self.assertEqual(config.output_attentions, False) + self.assertEqual(len(hidden_states), expected_num_layers) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.output_seq_length, self.model_tester.hidden_size], + ) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(config, inputs_dict, model_class) + + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + check_hidden_states_output(config, inputs_dict, model_class) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_train(self): + 
config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_training(*config_and_inputs) + + def test_labels_out_of_vocab(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_labels_out_of_vocab(*config_and_inputs) + + @unittest.skip(reason="Hubert has no input embeddings") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Hubert has no tokens embeddings") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Hubert has no input embeddings") + def test_model_common_attributes(self): + pass + + @slow + def test_model_from_pretrained(self): + model = TFHubertModel.from_pretrained("facebook/hubert-base-ls960") + self.assertIsNotNone(model) + + @unittest.skip(reason="Fix me! Hubert hits OOM errors when loss is computed on full batch") + def test_dataset_conversion(self): + # TODO: (Amy) - check whether skipping CTC model resolves this issue and possible resolutions for CTC + pass + + @unittest.skip(reason="Fix me! Hubert hits OOM errors when loss is computed on full batch") + def test_keras_fit(self): + # TODO: (Amy) - check whether skipping CTC model resolves this issue and possible resolutions for CTC + pass + + +@require_tf +class TFHubertRobustModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = (TFHubertModel, TFHubertForCTC) if is_tf_available() else () + test_resize_embeddings = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFHubertModelTester( + self, + conv_stride=(3, 3, 3), + feat_extract_norm="layer", + do_stable_layer_norm=True, + scope="robust", + ) + self.config_tester = ConfigTester(self, config_class=HubertConfig, hidden_size=37) + + # overwrite because input_values != input_ids + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["input_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + # overwrite because input_values != input_ids + def test_keyword_and_dict_args(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs_dict = model(inputs) + + inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + input_values = inputs_keywords.pop("input_values", None) + outputs_keywords = model(input_values, **inputs_keywords) + output_dict = outputs_dict[0].numpy() + output_keywords = outputs_keywords[0].numpy() + + self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_hidden_states_output(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_hidden_states_output(config, inputs_dict, model_class): + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + expected_num_layers = getattr( + self.model_tester, 
"expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + + hidden_states = outputs.hidden_states + self.assertEqual(config.output_attentions, False) + self.assertEqual(len(hidden_states), expected_num_layers) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [self.model_tester.output_seq_length, self.model_tester.hidden_size], + ) + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(config, inputs_dict, model_class) + + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + check_hidden_states_output(config, inputs_dict, model_class) + + def test_batched_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_batch_inference(*config_and_inputs) + + def test_ctc_loss_inference(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_ctc_loss(*config_and_inputs) + + def test_train(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_training(*config_and_inputs) + + def test_labels_out_of_vocab(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_labels_out_of_vocab(*config_and_inputs) + + @unittest.skip(reason="Hubert has no input embeddings") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Hubert has no tokens embeddings") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Hubert has no input embeddings or get_input_embeddings method") + def test_model_common_attributes(self): + pass + + @slow + def test_model_from_pretrained(self): + model = TFHubertModel.from_pretrained("facebook/hubert-large-ls960-ft") + self.assertIsNotNone(model) + + @unittest.skip(reason="Fix me! Hubert hits OOM errors when loss is computed on full batch") + def test_dataset_conversion(self): + # TODO: (Amy) - check whether skipping CTC model resolves this issue and possible resolutions for CTC + pass + + @unittest.skip(reason="Fix me! 
Hubert hits OOM errors when loss is computed on full batch") + def test_keras_fit(self): + # TODO: (Amy) - check whether skipping CTC model resolves this issue and possible resolutions for CTC + pass + + +@require_tf +class TFHubertUtilsTest(unittest.TestCase): + def test_compute_mask_indices(self): + batch_size = 4 + sequence_length = 60 + mask_prob = 0.5 + mask_length = 1 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + + self.assertListEqual( + tf.reduce_sum(mask, -1).numpy().tolist(), [mask_prob * sequence_length for _ in range(batch_size)] + ) + + def test_compute_mask_indices_overlap(self): + batch_size = 4 + sequence_length = 80 + mask_prob = 0.5 + mask_length = 4 + + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + + # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal + for batch_sum in tf.reduce_sum(mask, -1): + self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) + + +@require_tf +@slow +@require_soundfile +class TFHubertModelIntegrationTest(unittest.TestCase): + def _load_datasamples(self, num_samples): + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] + + return [x["array"] for x in speech_samples] + + def test_inference_ctc_normal(self): + model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft") + processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft", do_lower_case=True) + input_speech = self._load_datasamples(1) + + input_values = processor(input_speech, return_tensors="tf", sampling_rate=16000).input_values + + logits = model(input_values).logits + + predicted_ids = tf.argmax(logits, axis=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_normal_batched(self): + model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft") + processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft", do_lower_case=True) + + input_speech = self._load_datasamples(2) + + input_values = processor(input_speech, return_tensors="tf", padding=True, sampling_rate=16000).input_values + + logits = model(input_values).logits + + predicted_ids = tf.argmax(logits, axis=-1) + predicted_trans = processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) + + def test_inference_ctc_robust_batched(self): + model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft") + processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft", do_lower_case=True) + + input_speech = self._load_datasamples(4) + + inputs = processor(input_speech, return_tensors="tf", padding=True, sampling_rate=16000) + + input_values = inputs.input_values + attention_mask = inputs.attention_mask + + logits = model(input_values, attention_mask=attention_mask).logits + + predicted_ids = tf.argmax(logits, axis=-1) + predicted_trans = 
processor.batch_decode(predicted_ids) + + EXPECTED_TRANSCRIPTIONS = [ + "a man said to the universe sir i exist", + "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", + "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around" + " him with the thousands of spectators were trivialities not worth thinking about", + "his instant of panic was followed by a small sharp blow high on his chest", + ] + self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) diff --git a/docs/transformers/tests/models/ibert/__init__.py b/docs/transformers/tests/models/ibert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/ibert/test_modeling_ibert.py b/docs/transformers/tests/models/ibert/test_modeling_ibert.py new file mode 100644 index 0000000000000000000000000000000000000000..9065a7046b6d85360ad116148201fb67f0277f96 --- /dev/null +++ b/docs/transformers/tests/models/ibert/test_modeling_ibert.py @@ -0,0 +1,734 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import copy +import unittest + +from transformers import IBertConfig, is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + IBertForMaskedLM, + IBertForMultipleChoice, + IBertForQuestionAnswering, + IBertForSequenceClassification, + IBertForTokenClassification, + IBertModel, + ) + from transformers.models.ibert.modeling_ibert import ( + IBertEmbeddings, + IntGELU, + IntLayerNorm, + IntSoftmax, + QuantAct, + QuantEmbedding, + QuantLinear, + create_position_ids_from_input_ids, + ) + + +class IBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + 
self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return IBertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + quant_mode=True, + ) + + def get_pipeline_config(self): + config = self.get_config() + config.vocab_size = 300 + return config + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = IBertModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = IBertForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = IBertForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, 
sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = IBertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = IBertForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class IBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + test_pruning = False + test_torchscript = False + test_head_masking = False + test_resize_embeddings = False + + all_model_classes = ( + ( + IBertForMaskedLM, + IBertModel, + IBertForSequenceClassification, + IBertForTokenClassification, + IBertForMultipleChoice, + IBertForQuestionAnswering, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": IBertModel, + "fill-mask": IBertForMaskedLM, + "question-answering": IBertForQuestionAnswering, + "text-classification": IBertForSequenceClassification, + "token-classification": IBertForTokenClassification, + "zero-shot": IBertForSequenceClassification, + } + if is_torch_available() + else {} + ) + + def setUp(self): + self.model_tester = IBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=IBertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + # I-BERT only supports absolute embedding + for type in ["absolute"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def 
test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "kssteven/ibert-roberta-base" + model = IBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_create_position_ids_respects_padding_index(self): + """This is a regression test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is IBertEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + model = IBertEmbeddings(config=config) + + input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) + expected_positions = torch.as_tensor( + [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] + ) + + position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + def test_create_position_ids_from_inputs_embeds(self): + """This is a regression test for https://github.com/huggingface/transformers/issues/1761 + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is IBertEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + embeddings = IBertEmbeddings(config=config) + + inputs_embeds = torch.empty(2, 4, 30) + expected_single_positions = [ + 0 + embeddings.padding_idx + 1, + 1 + embeddings.padding_idx + 1, + 2 + embeddings.padding_idx + 1, + 3 + embeddings.padding_idx + 1, + ] + expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) + position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + # Override + def test_model_get_set_embeddings(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), QuantEmbedding) + model.set_input_embeddings(nn.Embedding(10, 10)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + # Override + def test_feed_forward_chunking(self): + pass # I-BERT does not support chunking + + # Override + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = 
model.get_input_embeddings() + if not self.is_encoder_decoder: + embed, embed_scaling_factor = wte(input_ids) + inputs["inputs_embeds"] = embed + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with torch.no_grad(): + model(**inputs)[0] + + @unittest.skip(reason="ibert overrides scaling to None if inputs_embeds") + def test_inputs_embeds_matches_input_ids(self): + pass + + +@require_torch +class IBertModelIntegrationTest(unittest.TestCase): + def test_quant_embedding(self): + weight_bit = 8 + embedding = QuantEmbedding(2, 4, quant_mode=True, weight_bit=weight_bit) + embedding_weight = torch.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) + embedding.weight = nn.Parameter(embedding_weight) + + expected_scaling_factor = embedding_weight.abs().max() / (2 ** (weight_bit - 1) - 1) + x, x_scaling_factor = embedding(torch.tensor(0)) + y, y_scaling_factor = embedding(torch.tensor(1)) + + # scaling factor should follow the symmetric quantization rule + self.assertTrue(torch.allclose(x_scaling_factor, expected_scaling_factor, atol=1e-4)) + self.assertTrue(torch.allclose(x_scaling_factor, expected_scaling_factor, atol=1e-4)) + self.assertTrue(torch.allclose(y_scaling_factor, expected_scaling_factor, atol=1e-4)) + + # quantization error should not exceed the scaling factor + self.assertTrue(torch.allclose(x, embedding_weight[0], atol=expected_scaling_factor)) + self.assertTrue(torch.allclose(y, embedding_weight[1], atol=expected_scaling_factor)) + + def test_quant_act(self): + def _test_range(): + act = QuantAct(activation_bit, act_range_momentum, quant_mode=True) + + # First pass + x = torch.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) + x_scaling_factor = torch.tensor(1.0) + y, y_scaling_factor = act(x, x_scaling_factor) + y_int = y / y_scaling_factor + + # After the first pass, x_min and x_max should be initialized with x.min() and x.max() + expected_x_min, expected_x_max = x.min(), x.max() + self.assertTrue(torch.allclose(act.x_min, expected_x_min, atol=1e-4)) + self.assertTrue(torch.allclose(act.x_max, expected_x_max, atol=1e-4)) + + # scaling factor should follow the symmetric quantization rule + expected_range = torch.max(expected_x_min.abs(), expected_x_max.abs()) + expected_scaling_factor = expected_range / (2 ** (activation_bit - 1) - 1) + self.assertTrue(torch.allclose(y_scaling_factor, expected_scaling_factor, atol=1e-4)) + + # quantization error should not exceed the scaling factor + self.assertTrue(torch.allclose(x, y, atol=expected_scaling_factor)) + + # output should be integer + self.assertTrue(torch.allclose(y_int, y_int.round(), atol=1e-4)) + + # Second Pass + x = torch.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) * 2 + x_scaling_factor = torch.tensor(1.0) + y, y_scaling_factor = act(x, x_scaling_factor) + y_int = y / y_scaling_factor + + # From the second pass, x_min and x_max should be updated with moving average + expected_x_min = expected_x_min * act_range_momentum + x.min() * (1 - act_range_momentum) + expected_x_max = expected_x_max * act_range_momentum + x.max() * (1 - act_range_momentum) + self.assertTrue(torch.allclose(act.x_min, expected_x_min, atol=1e-4)) + self.assertTrue(torch.allclose(act.x_max, expected_x_max, atol=1e-4)) + + # scaling factor should follow the symmetric quantization rule + expected_range = torch.max(expected_x_min.abs(), expected_x_max.abs()) + expected_scaling_factor = expected_range / (2 ** (activation_bit - 1) - 1) + 
self.assertTrue(torch.allclose(y_scaling_factor, expected_scaling_factor, atol=1e-4)) + + # quantization error should not exceed the scaling factor + x = x.clamp(min=-expected_range, max=expected_range) + self.assertTrue(torch.allclose(x, y, atol=expected_scaling_factor)) + + # output should be integer + self.assertTrue(torch.allclose(y_int, y_int.round(), atol=1e-4)) + + # Third pass, with eval() + act.eval() + x = torch.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) * 3 + + # In eval mode, min/max and scaling factor must be fixed + self.assertTrue(torch.allclose(act.x_min, expected_x_min, atol=1e-4)) + self.assertTrue(torch.allclose(act.x_max, expected_x_max, atol=1e-4)) + self.assertTrue(torch.allclose(y_scaling_factor, expected_scaling_factor, atol=1e-4)) + + def _test_identity(): + # test if identity and identity_scaling_factor are given + # should add the input values + act = QuantAct(activation_bit, act_range_momentum, quant_mode=True) + x = torch.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) + y = torch.tensor([[6.0, -7.0, 1.0, -2.0], [3.0, -4.0, -8.0, 5.0]]) + x_scaling_factor = torch.tensor(1.0) + y_scaling_factor = torch.tensor(0.5) + z, z_scaling_factor = act(x, x_scaling_factor, y, y_scaling_factor) + z_int = z / z_scaling_factor + self.assertTrue(torch.allclose(x + y, z, atol=0.1)) + self.assertTrue(torch.allclose(z_int, z_int.round(), atol=1e-4)) + + activation_bit = 8 + act_range_momentum = 0.95 + _test_range() + _test_identity() + + def test_quant_linear(self): + def _test(per_channel): + linear_q = QuantLinear(2, 4, quant_mode=True, per_channel=per_channel, weight_bit=weight_bit) + linear_dq = QuantLinear(2, 4, quant_mode=False, per_channel=per_channel, weight_bit=weight_bit) + linear_weight = torch.tensor([[-1.0, 2.0, 3.0, -4.0], [5.0, -6.0, -7.0, 8.0]]).T + linear_q.weight = nn.Parameter(linear_weight) + linear_dq.weight = nn.Parameter(linear_weight) + + q, q_scaling_factor = linear_q(x, x_scaling_factor) + q_int = q / q_scaling_factor + dq, dq_scaling_factor = linear_dq(x, x_scaling_factor) + + if per_channel: + q_max = linear_weight.abs().max(dim=1).values + else: + q_max = linear_weight.abs().max() + expected_scaling_factor = q_max / (2 ** (weight_bit - 1) - 1) + + # scaling factor should follow the symmetric quantization rule + self.assertTrue(torch.allclose(linear_q.fc_scaling_factor, expected_scaling_factor, atol=1e-4)) + + # output of the normal linear layer and the quantized linear layer should be similar + self.assertTrue(torch.allclose(q, dq, atol=0.5)) + + # output of the quantized linear layer should be integer + self.assertTrue(torch.allclose(q_int, q_int.round(), atol=1e-4)) + + weight_bit = 8 + x = torch.tensor([[2.0, -5.0], [-3.0, 4.0]]) + x_scaling_factor = torch.tensor([1.0]) + _test(True) + _test(False) + + def test_int_gelu(self): + gelu_q = IntGELU(quant_mode=True) + gelu_dq = nn.GELU() + + x_int = torch.arange(-10000, 10001, 1) + x_scaling_factor = torch.tensor(0.001) + x = x_int * x_scaling_factor + + q, q_scaling_factor = gelu_q(x, x_scaling_factor) + q_int = q / q_scaling_factor + dq = gelu_dq(x) + + # output of the normal GELU and the quantized GELU should be similar + self.assertTrue(torch.allclose(q, dq, atol=0.5)) + + # output of the quantized GELU layer should be integer + self.assertTrue(torch.allclose(q_int, q_int.round(), atol=1e-4)) + + def test_force_dequant_gelu(self): + x_int = torch.arange(-10000, 10001, 1) + x_scaling_factor = torch.tensor(0.001) + x = x_int * x_scaling_factor + + gelu_dq = 
IntGELU(quant_mode=False) + gelu_fdqs_dict = { + True: [ + IntGELU(quant_mode=True, force_dequant="nonlinear"), + IntGELU(quant_mode=True, force_dequant="gelu"), + ], + False: [ + IntGELU(quant_mode=True, force_dequant="none"), + IntGELU(quant_mode=True, force_dequant="softmax"), + IntGELU(quant_mode=True, force_dequant="layernorm"), + ], + } + + dq, dq_scaling_factor = gelu_dq(x, x_scaling_factor) + for label, gelu_fdqs in gelu_fdqs_dict.items(): + for gelu_fdq in gelu_fdqs: + q, q_scaling_factor = gelu_fdq(x, x_scaling_factor) + if label: + self.assertTrue(torch.allclose(q, dq, atol=1e-4)) + else: + self.assertFalse(torch.allclose(q, dq, atol=1e-4)) + + def test_int_softmax(self): + output_bit = 8 + softmax_q = IntSoftmax(output_bit, quant_mode=True) + softmax_dq = nn.Softmax() + + def _test(array): + x_int = torch.tensor(array) + x_scaling_factor = torch.tensor(0.1) + x = x_int * x_scaling_factor + + q, q_scaling_factor = softmax_q(x, x_scaling_factor) + q_int = q / q_scaling_factor + dq = softmax_dq(x) + + # output of the normal Softmax and the quantized Softmax should be similar + self.assertTrue(torch.allclose(q, dq, atol=0.5)) + + # output of the quantized Softmax should be integer + self.assertTrue(torch.allclose(q_int, q_int.round(), atol=1e-4)) + + # Output of the quantized Softmax should not exceed the output_bit + self.assertTrue(q.abs().max() < 2**output_bit) + + array = [[i + j for j in range(10)] for i in range(-10, 10)] + _test(array) + array = [[i + j for j in range(50)] for i in range(-10, 10)] + _test(array) + array = [[i + 100 * j for j in range(2)] for i in range(-10, 10)] + _test(array) + + def test_force_dequant_softmax(self): + output_bit = 8 + array = [[i + j for j in range(10)] for i in range(-10, 10)] + x_int = torch.tensor(array) + x_scaling_factor = torch.tensor(0.1) + x = x_int * x_scaling_factor + + softmax_dq = IntSoftmax(output_bit, quant_mode=False) + softmax_fdqs_dict = { + True: [ + IntSoftmax(output_bit, quant_mode=True, force_dequant="nonlinear"), + IntSoftmax(output_bit, quant_mode=True, force_dequant="softmax"), + ], + False: [ + IntSoftmax(output_bit, quant_mode=True, force_dequant="none"), + IntSoftmax(output_bit, quant_mode=True, force_dequant="gelu"), + IntSoftmax(output_bit, quant_mode=True, force_dequant="layernorm"), + ], + } + + dq, dq_scaling_factor = softmax_dq(x, x_scaling_factor) + for label, softmax_fdqs in softmax_fdqs_dict.items(): + for softmax_fdq in softmax_fdqs: + q, q_scaling_factor = softmax_fdq(x, x_scaling_factor) + if label: + self.assertTrue(torch.allclose(q, dq, atol=1e-4)) + else: + self.assertFalse(torch.allclose(q, dq, atol=1e-4)) + + def test_int_layernorm(self): + output_bit = 8 + + # some random matrix + array = [[[i * j * j + j for j in range(5, 15)]] for i in range(-10, 10)] + x_int = torch.tensor(array) + x_scaling_factor = torch.tensor(0.1) + x = x_int * x_scaling_factor + + ln_q = IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit) + ln_dq = nn.LayerNorm(x.shape[1:], 1e-5) + + ln_q.weight = nn.Parameter(torch.ones(x.shape[1:])) + ln_q.bias = nn.Parameter(torch.ones(x.shape[1:])) + ln_dq.weight = nn.Parameter(torch.ones(x.shape[1:])) + ln_dq.bias = nn.Parameter(torch.ones(x.shape[1:])) + + q, q_scaling_factor = ln_q(x, x_scaling_factor) + q_int = q / q_scaling_factor + dq = ln_dq(x) + + # output of the normal LN and the quantized LN should be similar + self.assertTrue(torch.allclose(q, dq, atol=0.5)) + + # output of the quantized LayerNorm should be integer + 
self.assertTrue(torch.allclose(q_int, q_int.round(), atol=1e-4)) + + def test_force_dequant_layernorm(self): + output_bit = 8 + array = [[[i * j * j + j for j in range(5, 15)]] for i in range(-10, 10)] + x_int = torch.tensor(array) + x_scaling_factor = torch.tensor(0.1) + x = x_int * x_scaling_factor + + ln_dq = IntLayerNorm(x.shape[1:], 1e-5, quant_mode=False, output_bit=output_bit) + ln_fdqs_dict = { + True: [ + IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="nonlinear"), + IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="layernorm"), + ], + False: [ + IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="none"), + IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="gelu"), + IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="softmax"), + ], + } + + ln_dq.weight = nn.Parameter(torch.ones(x.shape[1:])) + ln_dq.bias = nn.Parameter(torch.ones(x.shape[1:])) + dq, dq_scaling_factor = ln_dq(x, x_scaling_factor) + for label, ln_fdqs in ln_fdqs_dict.items(): + for ln_fdq in ln_fdqs: + ln_fdq.weight = nn.Parameter(torch.ones(x.shape[1:])) + ln_fdq.bias = nn.Parameter(torch.ones(x.shape[1:])) + q, q_scaling_factor = ln_fdq(x, x_scaling_factor) + if label: + self.assertTrue(torch.allclose(q, dq, atol=1e-4)) + else: + self.assertFalse(torch.allclose(q, dq, atol=1e-4)) + + def quantize(self, model): + # Helper function that quantizes the given model + # Recursively convert all the `quant_mode` attributes as `True` + if hasattr(model, "quant_mode"): + model.quant_mode = True + elif isinstance(model, nn.Sequential): + for n, m in model.named_children(): + self.quantize(m) + elif isinstance(model, nn.ModuleList): + for n in model: + self.quantize(n) + else: + for attr in dir(model): + mod = getattr(model, attr) + if isinstance(mod, nn.Module) and mod != model: + self.quantize(mod) + + @slow + def test_inference_masked_lm(self): + # I-BERT should be "equivalent" to RoBERTa if not quantized + # Test coped from `test_modeling_roberta.py` + model = IBertForMaskedLM.from_pretrained("kssteven/ibert-roberta-base") + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = torch.Size((1, 11, 50265)) + self.assertEqual(output.shape, expected_shape) + expected_slice = torch.tensor( + [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + # I-BERT should be "similar" to RoBERTa if quantized + self.quantize(model) + output = model(input_ids)[0] + self.assertEqual(output.shape, expected_shape) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=0.1)) + + @slow + def test_inference_classification_head(self): + # I-BERT should be "equivalent" to RoBERTa if not quantized + # Test coped from `test_modeling_roberta.py` + model = IBertForSequenceClassification.from_pretrained("kssteven/ibert-roberta-large-mnli") + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + expected_shape = torch.Size((1, 3)) + self.assertEqual(output.shape, expected_shape) + expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]]) + self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) + + # I-BERT should be "similar" to RoBERTa if quantized + self.quantize(model) + 
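+ # Illustrative recap (assuming the 8-bit settings exercised above): quantize() recursively
+ # flips every `quant_mode` flag to True, so the next forward pass goes through the quantized
+ # kernels. With symmetric 8-bit weights the per-tensor step is roughly max|W| / (2**7 - 1),
+ # e.g. max|W| = 8.0 -> ~0.063, which is why the comparison below relaxes the tolerance from
+ # 1e-4 to 0.1 rather than expecting exact equality.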
output = model(input_ids)[0] + self.assertEqual(output.shape, expected_shape) + self.assertTrue(torch.allclose(output, expected_tensor, atol=0.1)) diff --git a/docs/transformers/tests/models/idefics/__init__.py b/docs/transformers/tests/models/idefics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/idefics/test_image_processing_idefics.py b/docs/transformers/tests/models/idefics/test_image_processing_idefics.py new file mode 100644 index 0000000000000000000000000000000000000000..36a2ebd343ed01c6a2ff15c800b14f8cd94871d9 --- /dev/null +++ b/docs/transformers/tests/models/idefics/test_image_processing_idefics.py @@ -0,0 +1,207 @@ +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_torchvision, require_vision +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_torchvision_available(): + from torchvision import transforms + +if is_vision_available(): + from PIL import Image + + from transformers import IdeficsImageProcessor + + +class IdeficsImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + size=None, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + ): + size = size if size is not None else {"shortest_edge": 30} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + # self.size = size + self.image_mean = image_mean + self.image_std = image_std + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "image_size": self.image_size, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to IdeficsImageProcessor, + assuming do_resize is set to True with a scalar size and size_divisor. 
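+ For example (illustrative only, assuming size = 18 and size_divisor = 2): a 40x60 PIL image
+ gives scale = 18 / min(40, 60) = 0.45, so (newh, neww) = (0.45 * 60, 18) = (27, 18); the cap
+ max_size = int(1333 / 800 * 18) = 29 is not exceeded, and flooring to multiples of
+ size_divisor yields an expected (height, width) of (26, 18).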
+ """ + if not batched: + size = self.image_size + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + elif isinstance(image, np.ndarray): + h, w = image.shape[0], image.shape[1] + else: + h, w = image.shape[1], image.shape[2] + scale = size / min(w, h) + if h < w: + newh, neww = size, scale * w + else: + newh, neww = scale * h, size + + max_size = int((1333 / 800) * size) + if max(newh, neww) > max_size: + scale = max_size / max(newh, neww) + newh = newh * scale + neww = neww * scale + + newh, neww = int(newh + 0.5), int(neww + 0.5) + expected_height, expected_width = ( + newh // self.size_divisor * self.size_divisor, + neww // self.size_divisor * self.size_divisor, + ) + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + def expected_output_image_shape(self, images): + height, width = self.get_expected_values(images, batched=True) + return (self.num_channels, height, width) + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class IdeficsImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = IdeficsImageProcessor if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = IdeficsImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "image_size")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + self.assertNotEqual(image_processor.image_size, 30) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, image_size=42) + self.assertEqual(image_processor.image_size, 42) + + @require_torchvision + def test_torchvision_numpy_transforms_equivalency(self): + # as we had to reimplement the torchvision transforms using transformers utils we must check + # they both do the same + + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) + image_processor = self.image_processing_class(**self.image_processor_dict, return_tensors="pt") + + print(image_inputs) + + def convert_to_rgb(image): + # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background + # for transparent images. 
The call to `alpha_composite` handles this case + if image.mode == "RGB": + return image + + image_rgba = image.convert("RGBA") + background = Image.new("RGBA", image_rgba.size, (255, 255, 255)) + alpha_composite = Image.alpha_composite(background, image_rgba) + alpha_composite = alpha_composite.convert("RGB") + return alpha_composite + + image_size = image_processor.image_size + image_mean = image_processor.image_mean + image_std = image_processor.image_std + + transform = transforms.Compose( + [ + convert_to_rgb, + transforms.Resize((image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=image_mean, std=image_std), + ] + ) + + pixel_values_transform_implied = image_processor(image_inputs, transform=None, return_tensors="pt") + pixel_values_transform_supplied = image_processor(image_inputs, transform=transform, return_tensors="pt") + + torch.testing.assert_close(pixel_values_transform_implied, pixel_values_transform_supplied, rtol=0.0, atol=0.0) + + @unittest.skip(reason="not supported") + def test_call_numpy(self): + pass + + @unittest.skip(reason="not supported") + def test_call_numpy_4_channels(self): + pass + + @unittest.skip(reason="not supported") + def test_call_pil(self): + pass + + @unittest.skip(reason="not supported") + def test_call_pytorch(self): + pass diff --git a/docs/transformers/tests/models/idefics/test_modeling_idefics.py b/docs/transformers/tests/models/idefics/test_modeling_idefics.py new file mode 100644 index 0000000000000000000000000000000000000000..5f6a0f1832c9cd154b616c4967d37e229ced9829 --- /dev/null +++ b/docs/transformers/tests/models/idefics/test_modeling_idefics.py @@ -0,0 +1,934 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Idefics model.""" + +import inspect +import unittest + +import pytest +from parameterized import parameterized + +from transformers import BitsAndBytesConfig, IdeficsConfig, is_torch_available, is_vision_available +from transformers.testing_utils import ( + TestCasePlus, + require_bitsandbytes, + require_torch, + require_vision, + slow, + torch_device, +) +from transformers.utils import cached_property + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION, + ModelTesterMixin, + floats_tensor, + ids_tensor, + random_attention_mask, +) +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import IdeficsForVisionText2Text, IdeficsModel, IdeficsProcessor + from transformers.models.idefics.configuration_idefics import IdeficsPerceiverConfig, IdeficsVisionConfig + +if is_vision_available(): + from PIL import Image + + +class IdeficsModelTester: + def __init__( + self, + parent, + batch_size=1, + seq_length=7, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + alpha_initializer="ones", + num_labels=3, + scope=None, + modality_type_vocab_size=2, + vision_embed_dim=32, + vision_patch_size=2, + vision_image_size=30, + vision_num_attention_heads=4, + vision_num_hidden_layers=5, + vision_intermediate_size=37, + perceiver_qk_layer_norms_perceiver=False, + perceiver_resampler_depth=2, + perceiver_resampler_head_dim=8, + perceiver_resampler_n_heads=2, + perceiver_resampler_n_latents=16, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.alpha_initializer = alpha_initializer + self.num_labels = num_labels + self.scope = scope + self.modality_type_vocab_size = modality_type_vocab_size + + self.vision_embed_dim = vision_embed_dim + self.vision_patch_size = vision_patch_size + self.vision_image_size = vision_image_size + self.vision_num_attention_heads = vision_num_attention_heads + self.vision_num_hidden_layers = vision_num_hidden_layers + self.vision_intermediate_size = vision_intermediate_size + + self.vision_config = IdeficsVisionConfig( + embed_dim=self.vision_embed_dim, + patch_size=self.vision_patch_size, + 
image_size=self.vision_image_size, + num_attention_heads=self.vision_num_attention_heads, + num_hidden_layers=self.vision_num_hidden_layers, + intermediate_size=self.vision_intermediate_size, + ).to_dict() + + self.perceiver_qk_layer_norms_perceiver = perceiver_qk_layer_norms_perceiver + self.perceiver_resampler_depth = perceiver_resampler_depth + self.perceiver_resampler_head_dim = perceiver_resampler_head_dim + self.perceiver_resampler_n_heads = perceiver_resampler_n_heads + self.perceiver_resampler_n_latents = perceiver_resampler_n_latents + + self.perceiver_config = IdeficsPerceiverConfig( + qk_layer_norms_perceiver=self.perceiver_qk_layer_norms_perceiver, + resampler_depth=self.perceiver_resampler_depth, + resampler_head_dim=self.perceiver_resampler_head_dim, + resampler_n_heads=self.perceiver_resampler_n_heads, + resampler_n_latents=self.perceiver_resampler_n_latents, + ) + + # we set the expected sequence length (which is used in several tests) + # this is equal to the seq length of the text tokens + number of image patches + 1 for the CLS token + self.expected_seq_len = self.seq_length + (self.image_size // self.patch_size) ** 2 + 1 + + def prepare_config_and_inputs(self, num_images=1, interpolate_pos_encoding=False, image_expansion=0): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + pixel_values = floats_tensor( + [ + self.batch_size, + num_images, + self.num_channels, + self.image_size + image_expansion, + self.image_size + image_expansion, + ] + ) + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + image_attention_mask = random_attention_mask([self.batch_size, self.seq_length, num_images]) + + config = self.get_config() + return (config, input_ids, input_mask, pixel_values, image_attention_mask, interpolate_pos_encoding) + + def prepare_config_and_inputs_gate_tests(self): + # Create a list of configs and inputs, to test 2 things: + # 1. For the same image, the output should be different when image_attention_mask is filled with 0s vs filled with 1s. + # 2. For 2 different images, the output should be the same when image_attention_mask is filled with 0s. 
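+ # Sketch of what the two checks amount to (illustrative, using the inputs built below):
+ #   model(ids, pixel_values=img, image_attention_mask=all_ones)  vs  ...=all_zeros
+ # must give different hidden states, while two calls that differ only in pixel_values but
+ # both use an all-zero image_attention_mask must give identical hidden states, since the
+ # zeroed mask keeps every image token out of the gated cross-attention layers.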
+ + interpolate_pos_encoding = False + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + pixel_values = floats_tensor( + [ + self.batch_size, + 1, + self.num_channels, + self.image_size, + self.image_size, + ] + ) + pixel_values_list = [ + pixel_values.clone(), + pixel_values.clone(), + pixel_values.clone().fill_(0.6), + pixel_values.clone().fill_(0.3), + ] + attention_mask = None + if self.use_input_mask: + attention_mask = random_attention_mask([self.batch_size, self.seq_length]) + + image_attention_mask = random_attention_mask([self.batch_size, self.seq_length, 1]) + image_attention_mask_list = [ + image_attention_mask.clone().fill_(0), + image_attention_mask.clone().fill_(1), + image_attention_mask.clone().fill_(0), + image_attention_mask.clone().fill_(0), + ] + + config = self.get_config() + inputs_list = [] + for pixel_values, image_attention_mask in zip(pixel_values_list, image_attention_mask_list): + inputs_list.append( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "image_attention_mask": image_attention_mask, + "interpolate_pos_encoding": interpolate_pos_encoding, + } + ) + + inputs_w_same_img = inputs_list[:2] + inputs_w_0_img_attn = inputs_list[2:] + return config, inputs_w_same_img, inputs_w_0_img_attn + + def get_config(self): + return IdeficsConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + alpha_initializer=self.alpha_initializer, + num_labels=self.num_labels, + modality_type_vocab_size=self.modality_type_vocab_size, + vision_config=self.vision_config, + ) + + def create_and_check_model( + self, + config, + input_ids, + input_mask, + pixel_values, + image_attention_mask, + interpolate_pos_encoding, + ): + model = IdeficsModel(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + pixel_values=pixel_values, + image_attention_mask=image_attention_mask, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, input_ids.shape[1], self.hidden_size) + ) + + def create_and_check_model_gen( + self, + config, + input_ids, + input_mask, + pixel_values, + image_attention_mask, + interpolate_pos_encoding, + ): + model = IdeficsForVisionText2Text(config) + model.to(torch_device) + model.eval() + model.generate( + input_ids, + attention_mask=input_mask, + pixel_values=pixel_values, + image_attention_mask=image_attention_mask, + interpolate_pos_encoding=interpolate_pos_encoding, + max_length=self.seq_length + 2, + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + input_mask, + pixel_values, + image_attention_mask, + interpolate_pos_encoding, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": input_mask, + "pixel_values": pixel_values, + "image_attention_mask": image_attention_mask, + 
"interpolate_pos_encoding": interpolate_pos_encoding, + } + return config, inputs_dict + + def prepare_pixel_values(self): + return floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) + @unittest.skip(reason="Idefics has a hard requirement on SDPA, skipping this test") + def test_eager_matches_sdpa_inference( + self, name, torch_dtype, padding_side, use_attention_mask, output_attentions, enable_kernels + ): + pass + + +@require_torch +class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (IdeficsModel, IdeficsForVisionText2Text) if is_torch_available() else () + # Doesn't run generation tests here -- idefics has a dedicated tester for generation tests below + all_generative_model_classes = () + pipeline_model_mapping = ( + {"feature-extraction": IdeficsModel, "image-text-to-text": IdeficsForVisionText2Text} + if is_torch_available() + else {} + ) + test_pruning = False + test_headmasking = False + test_torchscript = False + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + # XXX: IdeficsForVisionText2TextTest has no MODEL_FOR group yet, but it should be the same + # as MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, so for now manually changing to do the right thing + # as super won't do it + if return_labels: + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + + return inputs_dict + + @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) + @unittest.skip("Idefics requires both text and image inputs which is currently not done in this test.") + def test_eager_matches_sdpa_inference( + self, name, torch_dtype, padding_side, use_attention_mask, output_attentions, enable_kernels + ): + pass + + def test_model_outputs_equivalence(self): + try: + orig = self.all_model_classes + # IdeficsModel.forward doesn't have labels input arg - only IdeficsForVisionText2Text does + self.all_model_classes = (IdeficsForVisionText2Text,) if is_torch_available() else () + super().test_model_outputs_equivalence() + finally: + self.all_model_classes = orig + + def setUp(self): + self.model_tester = IdeficsModelTester(self) + self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model_single_image(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=1, interpolate_pos_encoding=False, image_expansion=0 + ) + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_multiple_images(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=2, interpolate_pos_encoding=False, image_expansion=0 + ) + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_image_pos_embeddings_interpolation_single_image(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=1, interpolate_pos_encoding=True, image_expansion=2 + ) + self.model_tester.create_and_check_model(*config_and_inputs) + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=1, interpolate_pos_encoding=True, image_expansion=0 + ) + self.model_tester.create_and_check_model(*config_and_inputs) + + def 
test_model_with_image_pos_embeddings_interpolation_multiple_images(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=2, interpolate_pos_encoding=True, image_expansion=2 + ) + self.model_tester.create_and_check_model(*config_and_inputs) + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=2, interpolate_pos_encoding=True, image_expansion=0 + ) + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_generate_with_image_pos_embeddings_interpolation_single_image(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=1, interpolate_pos_encoding=True, image_expansion=2 + ) + self.model_tester.create_and_check_model_gen(*config_and_inputs) + + def test_generate_with_image_pos_embeddings_interpolation_multiple_images(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=2, interpolate_pos_encoding=True, image_expansion=2 + ) + self.model_tester.create_and_check_model_gen(*config_and_inputs) + + def test_cross_attention_gates(self): + config, inputs_w_same_img, inputs_w_0_img_attn = self.model_tester.prepare_config_and_inputs_gate_tests() + + model = IdeficsModel(config=config).to(torch_device) + model.eval() + test_1_results = [] + for inputs in inputs_w_same_img: + with torch.no_grad(): + last_hidden_states = model(**inputs).last_hidden_state + last_hidden_states = model(**inputs).last_hidden_state + test_1_results.append(last_hidden_states) + self.assertNotEqual(test_1_results[0].sum().item(), test_1_results[1].sum().item()) + + test_2_results = [] + for inputs in inputs_w_0_img_attn: + with torch.no_grad(): + last_hidden_states = model(**inputs).last_hidden_state + test_2_results.append(last_hidden_states) + self.assertEqual(test_2_results[0].sum().item(), test_2_results[1].sum().item()) + + def test_training(self): + if not self.model_tester.is_training: + self.skipTest(reason="model_tester.is_training is set to False") + + for model_class in self.all_model_classes: + # IdeficsModel does not support training, users should use + # IdeficsForVisionText2Text for this purpose + if model_class == IdeficsModel: + self.skipTest(reason="IdeficsModel does not support training") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + if not self.model_tester.is_training: + self.skipTest(reason="model_tester.is_training is set to False") + + for model_class in self.all_model_classes: + # IdeficsModel does not support training, users should use + # IdeficsForVisionText2Text for this purpose + if model_class == IdeficsModel: + self.skipTest(reason="IdeficsModel does not support training") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + + model = model_class(config) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def 
test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="""IDEFICS does not support retaining the gradients of the hidden states and attention""") + def test_retain_grad_hidden_states_attentions(self): + return + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + # IDEFICS does not support outputting attention score because it uses SDPA under the hood + self.assertTrue(attentions[0] is None) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + 1, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + # IDEFICS does not support outputting attention score because it uses SDPA under the hood + self.assertTrue(self_attentions[0] is None) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + seq_length = self.model_tester.seq_length + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + @slow + def test_model_from_pretrained(self): + model_name = "HuggingFaceM4/idefics-9b" + model = 
IdeficsModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @unittest.skip("Idefics has a hard requirement on SDPA") + def test_sdpa_can_dispatch_non_composite_models(self): + pass + + +@require_torch +class IdeficsForVisionText2TextTest(IdeficsModelTest, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (IdeficsForVisionText2Text,) if is_torch_available() else () + + def setUp(self): + self.model_tester = IdeficsModelTester( + self, + modality_type_vocab_size=3, + ) + self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37) + + @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) + @unittest.skip("Idefics requires both text and image inputs which is currently not done in this test.") + def test_eager_matches_sdpa_inference( + self, name, torch_dtype, padding_side, use_attention_mask, output_attentions, enable_kernels + ): + pass + + @pytest.mark.generate + def test_left_padding_compatibility(self): + """Overwrite because IDEFICS needs image attention mask to be also padded""" + # NOTE: left-padding results in small numerical differences. This is expected. + # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + def _prepare_model_kwargs(input_ids, attention_mask, image_attention_mask, signature): + model_kwargs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "image_attention_mask": image_attention_mask, + } + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in self.all_generative_model_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict.pop("input_ids") + attention_mask = inputs_dict.pop("attention_mask") + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + image_attention_mask = inputs_dict.pop("image_attention_mask", None) + + model = model_class(config).to(torch_device).eval() + signature = inspect.signature(model.forward).parameters.keys() + + # no cache as some models require special cache classes to be init outside forward + model.generation_config.use_cache = False + + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, image_attention_mask, signature) + next_logits_wo_padding = model(**model_kwargs, **inputs_dict).logits[:, -1, :] + + # With left-padding (length 32) + # can hardcode pad_token to be 0 as we'll do attn masking anyway + pad_token_id = ( + config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 + ) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + + pad_size_img = (input_ids.shape[0], 32, image_attention_mask.shape[-1]) + extra_img_mask = torch.zeros(pad_size_img, dtype=image_attention_mask.dtype, device=torch_device) + padded_image_attention_mask = torch.cat([extra_img_mask, image_attention_mask], dim=1) + model_kwargs = _prepare_model_kwargs( + padded_input_ids, padded_attention_mask, padded_image_attention_mask, signature + ) 
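+ # Why near-identical logits are expected (illustrative): the pad positions are masked out of
+ # attention, and when the model accepts position_ids they are rebuilt from the padded mask,
+ # e.g. mask [0, 0, 1, 1, 1] -> cumsum - 1 = [-1, -1, 0, 1, 2] -> masked_fill_ on the pads ->
+ # [1, 1, 0, 1, 2], so the real tokens keep positions 0..2; only small numerical differences
+ # remain, hence the 1e-5 tolerances below.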
+ next_logits_with_padding = model(**model_kwargs, **inputs_dict).logits[:, -1, :] + + # They should result in very similar logits + torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5) + + @pytest.mark.generate + def test_generate_continue_from_past_key_values(self): + """Overwrite because IDEFICS needs image attention mask to be also processed""" + + # Tests that we can continue generating from past key values, returned from a previous `generate` call + for model_class in self.all_generative_model_classes: + config, inputs = self.model_tester.prepare_config_and_inputs_for_common() + + # Let's make it always: + # 1. use cache (for obvious reasons) + # 2. generate to max length (which can be achieved by setting the eos token to an invalid value), which + # would make the test flaky (e.g. EOS is generated on iteration 1 on both generations, but the + # continuation would force it to generate beyond an EOS token) + # 3. ignore `token_type_ids` for simplicity + # 4. ignore `forced_eos_token_id`, which requires further manipulation of the continuation inputs and is + # active by default on some models + # 5. ignore `encoder_no_repeat_ngram_size`, which is set by default in some encoder-decoder models. When + # we use their decoder as a stand-alone model, `encoder_no_repeat_ngram_size` actually prevents + # repetition exclusively from the prompt. This test relies on comparing one call vs 2 calls + # with cache, what is considered a prompt is different in the two cases. + + model = model_class(config).to(torch_device) + model.eval() + model.generation_config.pad_token_id = model.generation_config.eos_token_id = -1 + model.generation_config.forced_eos_token_id = None + model.generation_config.encoder_no_repeat_ngram_size = 0 + model.generation_config.use_cache = True + + # Traditional way of generating text, with `return_dict_in_generate` to return the past key values + outputs = model.generate(**inputs, do_sample=False, max_new_tokens=4, return_dict_in_generate=True) + + # Let's generate again, but passing the past key values in between (3 + 1 = 4 tokens). Note that the + # inputs may need to be tweaked across `generate` calls (like the attention mask). 
+ outputs_cached = model.generate(**inputs, do_sample=False, max_new_tokens=3, return_dict_in_generate=True) + + # Continue from the tokens generated above, preparing the inputs accordingly + inputs["past_key_values"] = outputs_cached.past_key_values + new_attention_len = outputs_cached.sequences.shape[-1] + inputs["input_ids"] = outputs_cached.sequences + if "attention_mask" in inputs: + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], + (0, new_attention_len - inputs["attention_mask"].shape[1]), + mode="constant", + value=1, + ) + if "image_attention_mask" in inputs: + inputs["image_attention_mask"] = inputs["image_attention_mask"][:, -1:, :] + + outputs_cached = model.generate(**inputs, do_sample=False, max_new_tokens=1, return_dict_in_generate=True) + + # The two sets of generated text and past kv should be equal to each other + self.assertListEqual(outputs.sequences.tolist(), outputs_cached.sequences.tolist()) + for layer_idx in range(len(outputs_cached.past_key_values)): + for kv_idx in range(len(outputs_cached.past_key_values[layer_idx])): + self.assertTrue( + torch.allclose( + outputs.past_key_values[layer_idx][kv_idx], + outputs_cached.past_key_values[layer_idx][kv_idx], + ) + ) + + @pytest.mark.generate + def test_generate_without_input_ids(self): + """Overwrite because IDEFICS needs image attention mask to be also processed and requires image at input always.""" + + config, input_dict = self.prepare_config_and_inputs_for_generate() + pixel_values = input_dict["pixel_values"] + image_attention_mask = input_dict["image_attention_mask"][:, -1:, :] + + # hack in case they are equal, otherwise the attn mask will be [0] + if config.bos_token_id == config.pad_token_id: + config.pad_token_id = None + + for model_class in self.all_generative_model_classes: + model = model_class(config).to(torch_device) + model.eval() + + output_ids_generate = model.generate( + pixel_values=pixel_values, + image_attention_mask=image_attention_mask, + do_sample=False, + max_new_tokens=self.max_new_tokens, + remove_invalid_values=True, + ) + self.assertIsNotNone(output_ids_generate) + + @pytest.mark.generate + def test_generate_continue_from_inputs_embeds(self): + """Overwrite for IDEFICS: Ensure image attention mask is processed while continuing from `inputs_embeds`.""" + + for model_class in self.all_generative_model_classes: + config, inputs = self.model_tester.prepare_config_and_inputs_for_common() + print(inputs) + + model = model_class(config).to(torch_device).eval() + + model.generation_config.pad_token_id = model.generation_config.eos_token_id = -1 + model.generation_config.forced_eos_token_id = None + model.generation_config.use_cache = True + + input_ids = inputs.pop("input_ids") + input_embeds = model.get_input_embeddings()(input_ids) + + generation_kwargs = { + "return_dict_in_generate": True, + "do_sample": False, + } + + inputs["inputs_embeds"] = input_embeds + + # Traditional way of generating text, with `return_dict_in_generate` to return the past key values + outputs = model.generate(**inputs, max_new_tokens=4, **generation_kwargs) + # Let's generate again, but passing the past key values in between (3 + 1 = 4 tokens). Note that the + # inputs may need to be tweaked across `generate` calls (like the attention mask). 
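+ # Note (illustrative): when generation is driven by `inputs_embeds`, the returned sequences
+ # hold only the newly generated ids, so the attention mask for the continuation has to cover
+ # input_ids.shape[1] prompt positions plus those generated tokens, which is what
+ # new_attention_len computes further below.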
+ initial_output = model.generate(**inputs, max_new_tokens=3, **generation_kwargs) + inputs["past_key_values"] = initial_output.past_key_values + + new_attention_len = input_ids.shape[1] + initial_output.sequences.shape[-1] + continued_embeds = torch.cat([input_embeds, model.get_input_embeddings()(initial_output.sequences)], dim=1) + inputs["inputs_embeds"] = continued_embeds + + if "attention_mask" in inputs: + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], + (0, new_attention_len - inputs["attention_mask"].shape[1]), + mode="constant", + value=1, + ) + if "image_attention_mask" in inputs: + inputs["image_attention_mask"] = inputs["image_attention_mask"][..., -1:, :] + + cached_output = model.generate(**inputs, max_new_tokens=1, **generation_kwargs) + + # Verify that the combined outputs match the full generation. + combined_output_sequences = torch.concat([initial_output.sequences, cached_output.sequences], axis=1) + self.assertListEqual(outputs.sequences.tolist(), combined_output_sequences.tolist()) + for layer_idx in range(len(cached_output.past_key_values)): + for kv_idx in range(len(cached_output.past_key_values[layer_idx])): + self.assertTrue( + torch.allclose( + outputs.past_key_values[layer_idx][kv_idx], + cached_output.past_key_values[layer_idx][kv_idx], + ) + ) + + def _check_attentions_for_generate( + self, batch_size, attentions, prompt_length, output_length, config, decoder_past_key_values + ): + """ + Overwrite from generation tests because Idefics has only SDPA layers. + Do not skip because we still want generation tests to run. Rather we can remove checks for shape. + """ + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate(self): + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate_low_memory(self): + pass + + @unittest.skip(reason="We only test the model that takes in multiple images") + def test_custom_4d_attention_mask(self): + pass + + @unittest.skip(reason="IDEFICS cannot compile due to dynamic control flow when checking inputs") + def test_generate_with_static_cache(self): + pass + + @unittest.skip(reason="We only test the model that takes in multiple images") + def test_model(self): + pass + + @unittest.skip(reason="We only test the model that takes in multiple images") + def test_for_token_classification(self): + pass + + @unittest.skip(reason="""IDEFICS does not support retaining the gradients of the hidden states and attention""") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip("Idefics has a hard requirement on SDPA") + def test_sdpa_can_dispatch_non_composite_models(self): + pass + + @unittest.skip( + "Idefics has a separate test runner for generation tests with complex inheritance, causing this 
check to fail" + ) + def test_generation_tester_mixin_inheritance(self): + pass + + +@require_torch +@require_vision +class IdeficsModelIntegrationTest(TestCasePlus): + @cached_property + def default_processor(self): + return ( + IdeficsProcessor.from_pretrained("HuggingFaceM4/idefics-9b", revision="refs/pr/11") + if is_vision_available() + else None + ) + + @require_bitsandbytes + @slow + def test_inference_natural_language_visual_reasoning(self): + cat_image_path = self.tests_dir / "fixtures/tests_samples/COCO/000000039769.png" + cats_image_obj = Image.open(cat_image_path) # 2 cats + dogs_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg" + + prompts = [ + [ + "User:", + dogs_image_url, + "Describe this image.\nAssistant: An image of two dogs.\n", + "User:", + cats_image_obj, + "Describe this image.\nAssistant:", + ], + [ + "User:", + cats_image_obj, + "Describe this image.\nAssistant: An image of two kittens.\n", + "User:", + dogs_image_url, + "Describe this image.\nAssistant:", + ], + ] + + # the CI gpu is small so using quantization to fit + quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype="float16", + ) + model = IdeficsForVisionText2Text.from_pretrained( + "HuggingFaceM4/idefics-9b", quantization_config=quantization_config, device_map="auto" + ) + processor = self.default_processor + inputs = processor(text=prompts, return_tensors="pt", padding="longest").to(torch_device) + generated_ids = model.generate(**inputs, max_length=100) + generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) + + # keep for debugging + for i, t in enumerate(generated_text): + t = bytes(t, "utf-8").decode("unicode_escape") + print(f"{i}:\n{t}\n") + + self.assertIn("image of two cats", generated_text[0]) + self.assertIn("image of two dogs", generated_text[1]) diff --git a/docs/transformers/tests/models/idefics/test_modeling_tf_idefics.py b/docs/transformers/tests/models/idefics/test_modeling_tf_idefics.py new file mode 100644 index 0000000000000000000000000000000000000000..bd7c9b06c29d65cf859af6094dd7e54eb1e6c7d5 --- /dev/null +++ b/docs/transformers/tests/models/idefics/test_modeling_tf_idefics.py @@ -0,0 +1,559 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the TF Idefics model.""" + +import os +import tempfile +import unittest +from importlib import import_module + +from transformers import IdeficsConfig, is_tf_available, is_vision_available +from transformers.testing_utils import TestCasePlus, require_tf, require_vision, slow +from transformers.utils import cached_property + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import IdeficsProcessor, TFIdeficsForVisionText2Text, TFIdeficsModel + from transformers.modeling_tf_utils import keras + from transformers.models.idefics.configuration_idefics import IdeficsPerceiverConfig, IdeficsVisionConfig + +if is_vision_available(): + from PIL import Image + + +IDEFICS_TINY_RANDOM_MODEL = "HuggingFaceM4/tiny-random-idefics" + + +class IdeficsModelTester: + def __init__( + self, + parent, + batch_size=1, + seq_length=7, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + scope=None, + modality_type_vocab_size=2, + vision_embed_dim=32, + vision_patch_size=2, + vision_image_size=30, + vision_num_attention_heads=4, + vision_num_hidden_layers=5, + vision_intermediate_size=37, + perceiver_qk_layer_norms_perceiver=False, + perceiver_resampler_depth=2, + perceiver_resampler_head_dim=8, + perceiver_resampler_n_heads=2, + perceiver_resampler_n_latents=16, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.scope = scope + self.modality_type_vocab_size = modality_type_vocab_size + + self.vision_embed_dim = vision_embed_dim + self.vision_patch_size = vision_patch_size + self.vision_image_size = vision_image_size + self.vision_num_attention_heads = vision_num_attention_heads + self.vision_num_hidden_layers = vision_num_hidden_layers + self.vision_intermediate_size = vision_intermediate_size + + self.vision_config = IdeficsVisionConfig( + embed_dim=self.vision_embed_dim, + patch_size=self.vision_patch_size, + image_size=self.vision_image_size, + num_attention_heads=self.vision_num_attention_heads, + num_hidden_layers=self.vision_num_hidden_layers, + 
intermediate_size=self.vision_intermediate_size, + ) + + self.perceiver_qk_layer_norms_perceiver = perceiver_qk_layer_norms_perceiver + self.perceiver_resampler_depth = perceiver_resampler_depth + self.perceiver_resampler_head_dim = perceiver_resampler_head_dim + self.perceiver_resampler_n_heads = perceiver_resampler_n_heads + self.perceiver_resampler_n_latents = perceiver_resampler_n_latents + + self.perceiver_config = IdeficsPerceiverConfig( + qk_layer_norms_perceiver=self.perceiver_qk_layer_norms_perceiver, + resampler_depth=self.perceiver_resampler_depth, + resampler_head_dim=self.perceiver_resampler_head_dim, + resampler_n_heads=self.perceiver_resampler_n_heads, + resampler_n_latents=self.perceiver_resampler_n_latents, + ) + + # we set the expected sequence length (which is used in several tests) + # this is equal to the seq length of the text tokens + number of image patches + 1 for the CLS token + self.expected_seq_len = self.seq_length + (self.image_size // self.patch_size) ** 2 + 1 + + def prepare_config_and_inputs(self, num_images=1, interpolate_pos_encoding=False, image_expansion=0): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + pixel_values = floats_tensor( + [ + self.batch_size, + num_images, + self.num_channels, + self.image_size + image_expansion, + self.image_size + image_expansion, + ] + ) + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + image_attention_mask = random_attention_mask([self.batch_size, self.seq_length, num_images]) + + config = self.get_config() + return (config, input_ids, input_mask, pixel_values, image_attention_mask, interpolate_pos_encoding) + + def get_config(self): + return IdeficsConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + num_labels=self.num_labels, + modality_type_vocab_size=self.modality_type_vocab_size, + vision_config=self.vision_config, + ) + + def create_and_check_model( + self, + config, + input_ids, + input_mask, + pixel_values, + image_attention_mask, + interpolate_pos_encoding, + ): + model = TFIdeficsModel(config=config) + result = model( + input_ids, + attention_mask=input_mask, + pixel_values=pixel_values, + image_attention_mask=image_attention_mask, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, input_ids.shape[1], self.hidden_size) + ) + + def create_and_check_model_gen( + self, + config, + input_ids, + input_mask, + pixel_values, + image_attention_mask, + interpolate_pos_encoding, + ): + model = TFIdeficsForVisionText2Text(config) + model.generate( + input_ids, + attention_mask=input_mask, + pixel_values=pixel_values, + image_attention_mask=image_attention_mask, + interpolate_pos_encoding=interpolate_pos_encoding, + max_length=self.seq_length + 2, + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + 
input_mask, + pixel_values, + image_attention_mask, + interpolate_pos_encoding, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": input_mask, + "pixel_values": pixel_values, + "image_attention_mask": image_attention_mask, + "interpolate_pos_encoding": interpolate_pos_encoding, + } + return config, inputs_dict + + def prepare_pixel_values(self): + return floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + +@require_tf +class TFIdeficsModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (TFIdeficsModel, TFIdeficsForVisionText2Text) if is_tf_available() else () + pipeline_model_mapping = {"feature-extraction": TFIdeficsModel} if is_tf_available() else {} + test_pruning = False + test_headmasking = False + test_onnx = False + test_resize_embeddings = False + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + # XXX: IdeficsForVisionText2TextTest has no MODEL_FOR group yet, but it should be the same + # as MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, so for now manually changing to do the right thing + # as super won't do it + if return_labels: + inputs_dict["labels"] = tf.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int64 + ) + return inputs_dict + + def test_model_outputs_equivalence(self): + try: + orig = self.all_model_classes + # IdeficsModel.forward doesn't have labels input arg - only IdeficsForVisionText2Text does + self.all_model_classes = (TFIdeficsForVisionText2Text,) if is_tf_available() else () + super().test_model_outputs_equivalence() + finally: + self.all_model_classes = orig + + def setUp(self): + self.model_tester = IdeficsModelTester(self) + self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model_single_image(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=1, interpolate_pos_encoding=False, image_expansion=0 + ) + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_multiple_images(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=2, interpolate_pos_encoding=False, image_expansion=0 + ) + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_image_pos_embeddings_interpolation_single_image(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=1, interpolate_pos_encoding=True, image_expansion=2 + ) + self.model_tester.create_and_check_model(*config_and_inputs) + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=1, interpolate_pos_encoding=True, image_expansion=0 + ) + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_image_pos_embeddings_interpolation_multiple_images(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=2, interpolate_pos_encoding=True, image_expansion=2 + ) + self.model_tester.create_and_check_model(*config_and_inputs) + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=2, interpolate_pos_encoding=True, image_expansion=0 + ) + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_generate_with_image_pos_embeddings_interpolation_single_image(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs( + num_images=1, interpolate_pos_encoding=True, image_expansion=2 + ) + self.model_tester.create_and_check_model_gen(*config_and_inputs) + + def test_generate_with_image_pos_embeddings_interpolation_multiple_images(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs( + num_images=2, interpolate_pos_encoding=True, image_expansion=2 + ) + self.model_tester.create_and_check_model_gen(*config_and_inputs) + + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="""IDEFICS does not support retaining the gradients of the hidden states and attention""") + def test_retain_grad_hidden_states_attentions(self): + return + + @unittest.skip(reason="IDEFICS uses out-of-bounds embeddings deliberately.") + def test_embeddings_out_of_bounds_raise_exception(self): + pass + + @unittest.skip(reason="IDEFICS attention weights are not extracted in scaled_dot_product_attention") + def test_prepare_serving_output(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + # IDEFICS does not support outputting attention score because it uses SDPA under the hood + self.assertTrue(attentions[0] is None) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(out_len + 1, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + # IDEFICS does not support outputting attention score because it uses SDPA under the hood + self.assertTrue(self_attentions[0] is None) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + seq_length = self.model_tester.seq_length + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + 
[seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_keras_save_load(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + tf_main_layer_classes = { + module_member + for model_class in self.all_model_classes + for module in (import_module(model_class.__module__),) + for module_member_name in dir(module) + if module_member_name.endswith("MainLayer") + for module_member in (getattr(module, module_member_name),) + if isinstance(module_member, type) + and keras.layers.Layer in module_member.__bases__ + and getattr(module_member, "_keras_serializable", False) + } + + for main_layer_class in tf_main_layer_classes: + main_layer = main_layer_class(config) + + symbolic_inputs = { + name: keras.Input(tensor.shape[1:], dtype=tensor.dtype, batch_size=2) + for name, tensor in inputs_dict.items() + if tf.is_tensor(tensor) + } + model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) + outputs = model(inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + filepath = os.path.join(tmpdirname, "keras_model.h5") + model.save(filepath) + model = keras.models.load_model(filepath, custom_objects={main_layer_class.__name__: main_layer_class}) + assert isinstance(model, keras.Model) + after_outputs = model(inputs_dict) + self.assert_outputs_same(after_outputs, outputs) + + @unittest.skip(reason="IDEFICS test_keras_fit testing done in TFIdeficsForVisionText2TextTest") + def test_keras_fit(self): + pass + + @slow + def test_model_from_pretrained(self): + model = TFIdeficsModel.from_pretrained(IDEFICS_TINY_RANDOM_MODEL, from_pt=True) + self.assertIsNotNone(model) + + @unittest.skip(reason="Currently `saved_model` doesn't work with nested outputs.") + def test_saved_model_creation(self): + pass + + @unittest.skip(reason="""IDEFICS loss computation not implemented yet""") + def test_loss_computation(self): + pass + + +@require_tf +class TFIdeficsForVisionText2TextTest(TFIdeficsModelTest, unittest.TestCase): + all_model_classes = (TFIdeficsForVisionText2Text,) if is_tf_available() else () + test_resize_embeddings = False + + def setUp(self): + self.model_tester = IdeficsModelTester( + self, + modality_type_vocab_size=3, + ) + self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37) + + @unittest.skip("We only test the model that takes in multiple images") + def test_model(self): + pass + + @unittest.skip("We only test the model that takes in multiple images") + def test_for_token_classification(self): + pass + + @unittest.skip(reason="""IDEFICS does not support retaining the gradients of the hidden states and attention""") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="""IDEFICS loss computation not implemented yet""") + def test_loss_computation(self): + pass + + @slow + def test_keras_fit(self): + super().test_keras_fit() + + +# Below is the expected output for the integration test TFIdeficsModelIntegrationTest. 
+# Since we are using tiny-random to be able to fit it on the CI GPU,it is better to assert on the +# ids because the generated text is gibberish + +# fmt: off +EXPECTED_GENERATED_IDS = [[0, 0, 1, 4911, 29901, 32000, 32001, 32000, 20355, 915, 445, 1967, 29889, 13, 7900, 22137, 29901, 530, 1967, 310, 1023, 26361, 29889, 13, 2659, 29901, 32000, 32001, 32000, 20355, 915, 445, 1967, 29889, 13, 7900, 22137, 29901, 25519, 22326, 8071, 26357, 28004, 4428, 5916, 14383, 1033, 12358, 10536, 21834, 10447, 21201, 18102, 16886, 8875, 25388, 25914, 28304, 8558, 31048, 1322, 25952, 189, 31600, 3600, 12824, 7045, 28090, 20228, 32001, 5385, 29186, 2165, 11822, 13825, 23077, 7883, 22504, 2078, 18893, 2179, 10556, 9515, 7672, 3491, 12403, 5398, 27299, 6463, 16349, 23037, 28956, 16960, 22664, 7724, 17587, 17424, 10175, 17417, 5930, 30855, 17695, 16170, 14474, 29996, 313, 14502, 3241, 13618, 32001, 5385, 29186, 2165, 11822, 13825, 19934, 4875, 27142, 3230, 2709, 28054, 3270, 19148, 10917, 1060, 26443, 12259, 1347, 28482, 3830, 25519, 199, 12782, 9144, 12289, 1142, 18400, 21390, 19129, 7292, 28430, 24711, 5551, 30349, 30533, 13271, 17697, 4982, 8713, 5380, 17869, 12490, 5398, 27299, 11593, 19918, 15924, 29430, 10175, 17417, 5930, 30855, 17695, 16170, 14474, 19234], + [1, 4911, 29901, 32000, 32001, 32000, 20355, 915, 445, 1967, 29889, 13, 7900, 22137, 29901, 530, 1967, 310, 1023, 413, 986, 575, 29889, 13, 2659, 29901, 32000, 32001, 32000, 20355, 915, 445, 1967, 29889, 13, 7900, 22137, 29901, 25519, 22326, 8071, 26357, 28004, 4428, 17554, 20500, 21714, 27834, 4798, 12195, 30379, 5427, 20228, 10473, 14351, 8049, 15605, 14491, 212, 2711, 32000, 21714, 31259, 24368, 19036, 22970, 26083, 19394, 20372, 7672, 9939, 25388, 30533, 8200, 30271, 2114, 24749, 13224, 10603, 21118, 2179, 3759, 16515, 6587, 1287, 23998, 17793, 32001, 5385, 29186, 2165, 11822, 13825, 29732, 17503, 2729, 6722, 2943, 1221, 16043, 18244, 24965, 14383, 19840, 5980, 13488, 28531, 735, 26146, 22504, 2078, 18893, 20372, 7672, 32001, 5385, 29186, 2165, 11822, 13825, 29732, 17503, 2729, 6722, 19551, 220, 10528, 28940, 4453, 28266, 15416, 18693, 8199, 1153, 27706, 29231, 29186, 2165, 11822, 13825, 29732, 17503, 2729, 6722, 19551, 8231, 10739, 31992, 25906, 22254, 23127, 7689, 19614, 1149, 18844, 23037, 28956, 16960, 22664, 6975, 28938, 24002, 11026, 15020, 21964, 16307], ] + +@require_tf +@require_vision +class TFIdeficsModelIntegrationTest(TestCasePlus): + @cached_property + def default_processor(self): + return IdeficsProcessor.from_pretrained(IDEFICS_TINY_RANDOM_MODEL) if is_vision_available() else None + + @slow + def test_inference_natural_language_visual_reasoning(self): + cat_image_path = self.tests_dir / "fixtures/tests_samples/COCO/000000039769.png" + cats_image_obj = Image.open(cat_image_path) # 2 cats + dogs_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg" + + prompts = [ + [ + "User:", + dogs_image_url, + "Describe this image.\nAssistant: An image of two dogs.\n", + "User:", + cats_image_obj, + "Describe this image.\nAssistant:", + ], + [ + "User:", + cats_image_obj, + "Describe this image.\nAssistant: An image of two kittens.\n", + "User:", + dogs_image_url, + "Describe this image.\nAssistant:", + ], + ] + + model = TFIdeficsForVisionText2Text.from_pretrained(IDEFICS_TINY_RANDOM_MODEL, from_pt=True) + processor = self.default_processor + inputs = processor(prompts, return_tensors="tf") + generated_ids = model.generate(**inputs, max_length=100) + generated_text = 
processor.batch_decode(generated_ids, skip_special_tokens=True) + + # keep for debugging + for i, t in enumerate(generated_text): + t = bytes(t, "utf-8").decode("unicode_escape") + print(f"{i}:\n{t}\n") + + self.assertListEqual(EXPECTED_GENERATED_IDS[0], generated_ids[0].numpy().tolist()) + self.assertListEqual(EXPECTED_GENERATED_IDS[1], generated_ids[1].numpy().tolist()) diff --git a/docs/transformers/tests/models/idefics/test_processor_idefics.py b/docs/transformers/tests/models/idefics/test_processor_idefics.py new file mode 100644 index 0000000000000000000000000000000000000000..483d1ad1e90c607ebe20fd790542f6f4e435cc6c --- /dev/null +++ b/docs/transformers/tests/models/idefics/test_processor_idefics.py @@ -0,0 +1,221 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import shutil +import tempfile +import unittest + +import numpy as np + +from transformers import ( + AutoProcessor, + IdeficsImageProcessor, + IdeficsProcessor, + LlamaTokenizerFast, + PreTrainedTokenizerFast, +) +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + +@require_torch +@require_vision +class IdeficsProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = IdeficsProcessor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + + image_processor = IdeficsImageProcessor(return_tensors="pt") + tokenizer = LlamaTokenizerFast.from_pretrained("HuggingFaceM4/tiny-random-idefics") + + processor = IdeficsProcessor(image_processor, tokenizer) + + processor.save_pretrained(cls.tmpdirname) + + cls.input_keys = ["pixel_values", "input_ids", "attention_mask", "image_attention_mask"] + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname, ignore_errors=True) + + def prepare_prompts(self): + """This function prepares a list of PIL images""" + + num_images = 2 + images = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8) for x in range(num_images)] + images = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in images] + + # print([type(x) for x in images]) + # die + + prompts = [ + # text and 1 image + [ + "User:", + images[0], + "Describe this image.\nAssistant:", + ], + # text and images + [ + "User:", + images[0], + "Describe this image.\nAssistant: An image of two dogs.\n", + "User:", + images[1], + "Describe this image.\nAssistant:", + ], + # only text + [ + "User:", + "Describe this image.\nAssistant: An image of two kittens.\n", + "User:", + "Describe this image.\nAssistant:", + ], + # only images + [ + 
images[0], + images[1], + ], + ] + + return prompts + + def test_save_load_pretrained_additional_features(self): + with tempfile.TemporaryDirectory() as tmpdir: + processor = IdeficsProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor.save_pretrained(tmpdir) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + + processor = IdeficsProcessor.from_pretrained( + tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, IdeficsImageProcessor) + + def test_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor) + + prompts = self.prepare_prompts() + + # test that all prompts succeeded + input_processor = processor(text=prompts, return_tensors="pt", padding="longest") + for key in self.input_keys: + assert torch.is_tensor(input_processor[key]) + + def test_tokenizer_decode(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor, return_tensors="pt") + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + def test_tokenizer_padding(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer(padding_side="right") + + processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor, return_tensors="pt") + + predicted_tokens = [ + " Describe this image.\nAssistant:", + " Describe this image.\nAssistant:", + ] + predicted_attention_masks = [ + ([1] * 10) + ([0] * 9), + ([1] * 10) + ([0] * 10), + ] + prompts = [[prompt] for prompt in self.prepare_prompts()[2]] + + max_length = processor(text=prompts, padding="max_length", truncation=True, max_length=20, return_tensors="pt") + longest = processor(text=prompts, padding="longest", truncation=True, max_length=30, return_tensors="pt") + + decoded_max_length = processor.tokenizer.decode(max_length["input_ids"][-1]) + decoded_longest = processor.tokenizer.decode(longest["input_ids"][-1]) + + self.assertEqual(decoded_max_length, predicted_tokens[1]) + self.assertEqual(decoded_longest, predicted_tokens[0]) + + self.assertListEqual(max_length["attention_mask"][-1].tolist(), predicted_attention_masks[1]) + self.assertListEqual(longest["attention_mask"][-1].tolist(), predicted_attention_masks[0]) + + def test_tokenizer_left_padding(self): + """Identical to test_tokenizer_padding, but with padding_side not explicitly set.""" + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor) + + predicted_tokens = [ + " Describe this image.\nAssistant:", + " Describe this image.\nAssistant:", + ] + predicted_attention_masks = [ + ([0] * 9) + ([1] * 10), + ([0] * 10) + ([1] * 10), + 
] + prompts = [[prompt] for prompt in self.prepare_prompts()[2]] + max_length = processor(text=prompts, padding="max_length", truncation=True, max_length=20) + longest = processor(text=prompts, padding="longest", truncation=True, max_length=30) + + decoded_max_length = processor.tokenizer.decode(max_length["input_ids"][-1]) + decoded_longest = processor.tokenizer.decode(longest["input_ids"][-1]) + + self.assertEqual(decoded_max_length, predicted_tokens[1]) + self.assertEqual(decoded_longest, predicted_tokens[0]) + + self.assertListEqual(max_length["attention_mask"][-1].tolist(), predicted_attention_masks[1]) + self.assertListEqual(longest["attention_mask"][-1].tolist(), predicted_attention_masks[0]) + + def test_model_input_names(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor) + prompts = self.prepare_prompts() + + inputs = processor(text=prompts, padding="longest", return_tensors="pt") + + # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask'] + self.assertSetEqual(set(inputs.keys()), set(self.input_keys)) diff --git a/docs/transformers/tests/models/idefics2/__init__.py b/docs/transformers/tests/models/idefics2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/idefics2/test_image_processing_idefics2.py b/docs/transformers/tests/models/idefics2/test_image_processing_idefics2.py new file mode 100644 index 0000000000000000000000000000000000000000..c389645020792bc7e39238a1a1bee171aea3ca02 --- /dev/null +++ b/docs/transformers/tests/models/idefics2/test_image_processing_idefics2.py @@ -0,0 +1,310 @@ +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
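+# The shape bookkeeping in this file leans on two conventions encoded by the
+# tester class further down: with `do_image_splitting=True` each input image is
+# counted as 5 images (the tester assumes 4 crops plus the resized original, see
+# `expected_output_image_shape`), and resizing preserves the aspect ratio while
+# capping the longest edge at `size["longest_edge"]` and enforcing at least
+# `size["shortest_edge"]` on both sides. The helper below is a minimal,
+# self-contained sketch of that size rule with illustrative names; the tester's
+# `get_expected_values` implements the same arithmetic.
+def _sketch_expected_resize(width, height, shortest_edge=378, longest_edge=980):
+    """Return the (height, width) a test image is expected to have after resizing."""
+    aspect_ratio = width / height
+    if width > height and width >= longest_edge:
+        width = longest_edge
+        height = int(width / aspect_ratio)
+    elif height > width and height >= longest_edge:
+        height = longest_edge
+        width = int(height * aspect_ratio)
+    return max(height, shortest_edge), max(width, shortest_edge)
+
+# Example: _sketch_expected_resize(1960, 980) -> (490, 980); the longest edge is
+# capped at 980 and the other side is scaled down proportionally.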
+ + +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin + + +if is_vision_available(): + from PIL import Image + + from transformers import Idefics2ImageProcessor + + +if is_torch_available(): + import torch + + +class Idefics2ImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + num_images=1, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_rescale=True, + rescale_factor=1 / 255, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + do_convert_rgb=True, + do_pad=True, + do_image_splitting=True, + ): + size = size if size is not None else {"shortest_edge": 378, "longest_edge": 980} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.num_images = num_images + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_convert_rgb = do_convert_rgb + self.do_pad = do_pad + self.do_image_splitting = do_image_splitting + + def prepare_image_processor_dict(self): + return { + "do_convert_rgb": self.do_convert_rgb, + "do_resize": self.do_resize, + "size": self.size, + "do_rescale": self.do_rescale, + "rescale_factor": self.rescale_factor, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_pad": self.do_pad, + "do_image_splitting": self.do_image_splitting, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to BridgeTowerImageProcessor, + assuming do_resize is set to True with a scalar size and size_divisor. 
+ """ + if not batched: + shortest_edge = self.size["shortest_edge"] + longest_edge = self.size["longest_edge"] + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + elif isinstance(image, np.ndarray): + h, w = image.shape[0], image.shape[1] + else: + h, w = image.shape[1], image.shape[2] + + aspect_ratio = w / h + if w > h and w >= longest_edge: + w = longest_edge + h = int(w / aspect_ratio) + elif h > w and h >= longest_edge: + h = longest_edge + w = int(h * aspect_ratio) + w = max(w, shortest_edge) + h = max(h, shortest_edge) + expected_height = h + expected_width = w + else: + expected_values = [] + for images in image_inputs: + for image in images: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + def expected_output_image_shape(self, images): + height, width = self.get_expected_values(images, batched=True) + effective_nb_images = self.num_images * 5 if self.do_image_splitting else 1 + return effective_nb_images, self.num_channels, height, width + + def prepare_image_inputs( + self, + batch_size=None, + min_resolution=None, + max_resolution=None, + num_channels=None, + num_images=None, + size_divisor=None, + equal_resolution=False, + numpify=False, + torchify=False, + ): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. + + One can specify whether the images are of the same resolution or not. + """ + assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" + + batch_size = batch_size if batch_size is not None else self.batch_size + min_resolution = min_resolution if min_resolution is not None else self.min_resolution + max_resolution = max_resolution if max_resolution is not None else self.max_resolution + num_channels = num_channels if num_channels is not None else self.num_channels + num_images = num_images if num_images is not None else self.num_images + + images_list = [] + for i in range(batch_size): + images = [] + for j in range(num_images): + if equal_resolution: + width = height = max_resolution + else: + # To avoid getting image width/height 0 + if size_divisor is not None: + # If `size_divisor` is defined, the image needs to have width/size >= `size_divisor` + min_resolution = max(size_divisor, min_resolution) + width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2) + images.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8)) + images_list.append(images) + + if not numpify and not torchify: + # PIL expects the channel dimension as last dimension + images_list = [[Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] for images in images_list] + + if torchify: + images_list = [[torch.from_numpy(image) for image in images] for images in images_list] + + if numpify: + # Numpy images are typically in channels last format + images_list = [[image.transpose(1, 2, 0) for image in images] for images in images_list] + + return images_list + + +@require_torch +@require_vision +class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = Idefics2ImageProcessor if is_vision_available() else None + + def setUp(self): + 
super().setUp() + self.image_processor_tester = Idefics2ImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_pad")) + self.assertTrue(hasattr(image_processing, "do_image_splitting")) + + def test_call_numpy(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for sample_images in image_inputs: + for image in sample_images: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + def test_call_numpy_4_channels(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processor_dict = self.image_processor_dict + image_processor_dict["image_mean"] = [0.5, 0.5, 0.5, 0.5] + image_processor_dict["image_std"] = [0.5, 0.5, 0.5, 0.5] + image_processing = self.image_processing_class(**image_processor_dict) + # create random numpy tensors + self.image_processor_tester.num_channels = 4 + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + + for sample_images in image_inputs: + for image in sample_images: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing( + image_inputs[0], input_data_format="channels_last", return_tensors="pt" + ).pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing( + image_inputs, input_data_format="channels_last", return_tensors="pt" + ).pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + def test_call_pil(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = 
self.image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) + for images in image_inputs: + for image in images: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + def test_call_pytorch(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + + for images in image_inputs: + for image in images: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + tuple(encoded_images.shape), + (self.image_processor_tester.batch_size, *expected_output_image_shape), + ) diff --git a/docs/transformers/tests/models/idefics2/test_modeling_idefics2.py b/docs/transformers/tests/models/idefics2/test_modeling_idefics2.py new file mode 100644 index 0000000000000000000000000000000000000000..325d9714345104440a986dd8067d2719f7c5606c --- /dev/null +++ b/docs/transformers/tests/models/idefics2/test_modeling_idefics2.py @@ -0,0 +1,694 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
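+# How the model tester below wires images into the text stream: image positions
+# are marked by overwriting the *last* tokens of each sequence with
+# `image_token_id` (99), and the number of such tokens per sample is
+# `num_images * perceiver_config["resampler_n_latents"]`. With the defaults used
+# here (num_images=2, resampler_n_latents=2) that is 4 image tokens at the end
+# of every 10-token sequence. A minimal sketch of that layout, with illustrative
+# values rather than real tokenizer output:
+#
+#     import torch
+#     seq_length, num_images, n_latents, image_token_id = 10, 2, 2, 99
+#     input_ids = torch.randint(1, 99, (2, seq_length))
+#     input_ids[:, -num_images * n_latents:] = image_token_id
+#     # the last 4 columns of every row are now 99, matching
+#     # Idefics2VisionText2TextModelTester.prepare_config_and_inputs_for_common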
+"""Testing suite for the PyTorch Idefics2 model.""" + +import copy +import tempfile +import unittest +from io import BytesIO + +import pytest +import requests + +from transformers import ( + AutoProcessor, + Idefics2Config, + Idefics2ForConditionalGeneration, + Idefics2Model, + is_torch_available, + is_vision_available, +) +from transformers.testing_utils import ( + cleanup, + require_bitsandbytes, + require_flash_attn, + require_torch, + require_torch_gpu, + require_torch_multi_gpu, + require_torch_sdpa, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + +class Idefics2VisionText2TextModelTester: + def __init__( + self, + parent, + is_training=True, + batch_size=2, + num_images=2, + seq_length=10, + vision_config={ + "image_size": 12, + "patch_size": 12, + "num_channels": 3, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 32, + "dropout": 0.1, + "attention_dropout": 0.1, + "initializer_range": 0.02, + }, + perceiver_config={ + "hidden_act": "silu", + "resampler_n_latents": 2, + "resampler_depth": 2, + "resampler_n_heads": 2, + "num_key_value_heads": 1, + "resampler_head_dim": 12, + "attention_dropout": 0.0, + }, + text_config={ + "vocab_size": 100, + "hidden_size": 64, + "intermediate_size": 56, + "num_hidden_layers": 3, + "num_attention_heads": 2, + "num_key_value_heads": 2, + "hidden_act": "silu", + "max_position_embeddings": 256, + "initializer_range": 0.02, + "rms_norm_eps": 1e-6, + "pad_token_id": 0, # None in the original configuration_mistral, we set it to the unk_token_id + "bos_token_id": 1, + "eos_token_id": 2, + "image_token_id": 99, + "tie_word_embeddings": False, + "rope_theta": 10000.0, + "sliding_window": 32, + "attention_dropout": 0.0, + }, + use_cache=False, + tie_word_embeddings=False, + image_token_id=99, + ): + self.parent = parent + self.is_training = is_training + self.batch_size = batch_size + self.num_images = num_images + self.num_channels = 3 + self.seq_length = seq_length + self.use_cache = use_cache + self.image_token_id = image_token_id + self.tie_word_embeddings = tie_word_embeddings + # Hack - add properties here so use common tests + self.vocab_size = text_config["vocab_size"] + self.num_hidden_layers = text_config["num_hidden_layers"] + self.num_attention_heads = text_config["num_attention_heads"] + self.hidden_size = text_config["hidden_size"] + + self.vision_config = vision_config + self.perceiver_config = perceiver_config + self.text_config = text_config + + def get_config(self): + return Idefics2Config( + use_cache=self.use_cache, + image_token_id=self.image_token_id, + tie_word_embeddings=self.tie_word_embeddings, + vision_config=self.vision_config, + perceiver_config=self.perceiver_config, + text_config=self.text_config, + vocab_size=self.vocab_size, + ) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [ + self.batch_size, + self.num_images, + self.vision_config["num_channels"], + self.vision_config["image_size"], + self.vision_config["image_size"], + ] + ) + config = self.get_config() + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + input_ids = 
ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1 + + # For simplicity just set the last n tokens to the image token + n_image_tokens_per_batch = self.num_images * self.perceiver_config["resampler_n_latents"] + input_ids[:, -n_image_tokens_per_batch:] = self.image_token_id + attention_mask = input_ids.ne(1).to(torch_device) + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class Idefics2ModelTest(ModelTesterMixin, unittest.TestCase): + """ + Model tester for `Idefics2`. + """ + + all_model_classes = (Idefics2Model,) if is_torch_available() else () + fx_compatible = False + test_torchscript = False + test_pruning = False + test_resize_embeddings = True + test_head_masking = False + _is_composite = True + + def setUp(self): + self.model_tester = Idefics2VisionText2TextModelTester(self) + self.config_tester = ConfigTester( + self, config_class=Idefics2Config, has_text_modality=False, common_properties=["image_token_id"] + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="input_embeds cannot be passed in without input_ids") + def test_inputs_embeds(): + pass + + @unittest.skip(reason="input_embeds cannot be passed in without input_ids") + def test_inputs_embeds_matches_input_ids(self): + pass + + @unittest.skip(reason="Model does not support padding right") + def test_flash_attn_2_generate_padding_right(self): + pass + + @unittest.skip(reason="Model does not support padding right") + def test_flash_attn_2_inference_padding_right(self): + pass + + # We need to override as we need to prepare such that the image token is the last token + def test_resize_tokens_embeddings(self): + (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + if self.model_tester.is_training is False: + model.eval() + + model_vocab_size = config.text_config.vocab_size + # Retrieve the embeddings and clone theme + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) + + # Ignore copy + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2) 
+ n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"] + model.image_token_id = model_vocab_size - 15 - 1 + inputs_dict["input_ids"][:, -n_images:] = model.image_token_id + + # make sure that decoder_input_ids are resized as well + if "decoder_input_ids" in inputs_dict: + inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that adding and removing tokens has not modified the first part of the embedding matrix. + models_equal = True + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + model_vocab_size = config.text_config.vocab_size + model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1) + self.assertTrue(model.config.text_config.vocab_size + 10, model_vocab_size) + + model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0] // 64, 0) + + self.assertTrue(model_embed.weight.shape[0], model.config.text_config.vocab_size) + self.assertTrue(model.config.text_config.vocab_size, model.vocab_size) + + model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0] // 64, 0) + + # Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size + target_dimension = 128 + model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0], target_dimension) + + with self.assertRaisesRegex( + ValueError, + "Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. 
Please make sure to pass an integer", + ): + model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3) + + # We need to override as we need to prepare such that the image token is the last token + def test_resize_embeddings_untied(self): + (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() + + original_config.tie_word_embeddings = False + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config).to(torch_device) + + # if no output embeddings -> leave test + if model.get_output_embeddings() is None: + continue + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_vocab_size = config.text_config.vocab_size + model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10) + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2) + n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"] + model.image_token_id = model_vocab_size - 15 - 1 + inputs_dict["input_ids"][:, -n_images:] = model.image_token_id + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model_sdpa.vision_model.config._attn_implementation == "sdpa") + self.assertTrue(model_sdpa.connector.perceiver_resampler.config._attn_implementation == "sdpa") + + model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") + model_eager = model_eager.eval().to(torch_device) + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") + 
self.assertTrue(model_eager.connector.perceiver_resampler.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + +@require_torch +class Idefics2ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTesterMixin, unittest.TestCase): + """ + Model tester for `Idefics2ForConditionalGeneration`. + """ + + all_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = {"image-text-to-text": Idefics2ForConditionalGeneration} if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_resize_embeddings = True + test_head_masking = False + test_torchscript = False + + def setUp(self): + self.model_tester = Idefics2VisionText2TextModelTester(self) + self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False) + + @unittest.skip(reason="input_embeds cannot be passed in without input_ids") + def test_inputs_embeds(): + pass + + @unittest.skip(reason="Model does not support padding right") + def test_flash_attn_2_generate_padding_right(self): + pass + + @unittest.skip(reason="Model does not support padding right") + def test_flash_attn_2_inference_padding_right(self): + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate(self): + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate_low_memory(self): + pass + + @unittest.skip( + reason="Prompt lookup decoding needs a way to indicate `bad_word_ids` that should not be suggested as candidates" + ) + def test_prompt_lookup_decoding_matches_greedy_search(self): + pass + + @pytest.mark.generate + @require_torch_sdpa + @slow + @unittest.skip( + reason="Idefics2 doesn't support SDPA for all backbones, vision backbones has only eager/FA2 attention" + ) + def test_eager_matches_sdpa_generate(self): + pass + + # We need to override as we need to prepare such that the image token is the last token + def test_resize_tokens_embeddings(self): + (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + model_vocab_size = config.text_config.vocab_size + # Retrieve the embeddings and clone theme + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size 
decreases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2) + n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"] + model.model.image_token_id = model_vocab_size - 15 - 1 + inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id + + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that adding and removing tokens has not modified the first part of the embedding matrix. + models_equal = True + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + model_vocab_size = config.text_config.vocab_size + model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1) + self.assertTrue(model.config.text_config.vocab_size + 10, model_vocab_size) + + model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0] // 64, 0) + + self.assertTrue(model_embed.weight.shape[0], model.config.text_config.vocab_size) + self.assertTrue(model.config.text_config.vocab_size, model.vocab_size) + + model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0] // 64, 0) + + # Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size + target_dimension = 128 + model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0], target_dimension) + + with self.assertRaisesRegex( + ValueError, + "Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. 
Please make sure to pass an integer", + ): + model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3) + + # We need to override as we need to prepare such that the image token is the last token + def test_resize_embeddings_untied(self): + (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() + + original_config.tie_word_embeddings = False + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config).to(torch_device) + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_vocab_size = config.text_config.vocab_size + model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10) + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2) + n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"] + model.model.image_token_id = model_vocab_size - 15 - 1 + inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + def test_inputs_embeds_matches_input_ids_with_generate(self): + # overwrite because IDEFICS needs ids and embeds at the input to be not None + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 + + wte = model.get_input_embeddings() + + input_ids = inputs["input_ids"] + # some models infer position ids/attn mask differently when input ids + # by check if pad_token let's make sure no padding is in input ids + not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 + input_ids[input_ids == pad_token_id] = not_pad_token_id + del inputs["input_ids"] + inputs_embeds = wte(input_ids) + out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2) + out_embeds = model.generate(input_ids=input_ids, inputs_embeds=inputs_embeds, 
**inputs, max_new_tokens=2) + + torch.testing.assert_close(out_embeds, out_ids) + + +@require_torch +class Idefics2ForConditionalGenerationIntegrationTest(unittest.TestCase): + def setUp(self): + self.processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-base") + self.image1 = Image.open( + BytesIO( + requests.get( + "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" + ).content + ) + ) + self.image2 = Image.open( + BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content) + ) + self.image3 = Image.open( + BytesIO( + requests.get( + "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg" + ).content + ) + ) + + def tearDown(self): + cleanup(torch_device, gc_collect=True) + + @slow + @require_torch_multi_gpu + def test_integration_test(self): + model = Idefics2ForConditionalGeneration.from_pretrained( + "HuggingFaceM4/idefics2-8b-base", + torch_dtype=torch.bfloat16, + device_map="auto", + ) + + # Create inputs + text = "In this image, we see" + images = self.image1 + inputs = self.processor(text=text, images=images, return_tensors="pt", padding=True) + inputs.to(torch_device) + + generated_ids = model.generate(**inputs, max_new_tokens=10) + generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True) + + # Batch affects generated text. Single batch output: ['In this image, we see the Statue of Liberty in the foreground and'] + expected_generated_text = "In this image, we see the Statue of Liberty, the New York City" + self.assertEqual(generated_texts[0], expected_generated_text) + + @slow + @require_bitsandbytes + def test_integration_test_4bit(self): + # Let' s make sure we test the preprocessing to replace what is used + model = Idefics2ForConditionalGeneration.from_pretrained( + "HuggingFaceM4/idefics2-8b-base", + load_in_4bit=True, + ) + + # Create pixel inputs + text = ["In this image, we see", "bla, bla "] + images = [[self.image1], [self.image2, self.image3]] + inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt").to(torch_device) + + generated_ids = model.generate(**inputs, max_new_tokens=10) + generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True) + + expected_generated_text = "In this image, we see the Statue of Liberty, the Hudson River," + self.assertEqual(generated_texts[0], expected_generated_text) + + @slow + @require_bitsandbytes + def test_integration_test_4bit_batch2(self): + # Let' s make sure we test the preprocessing to replace what is used + + model = Idefics2ForConditionalGeneration.from_pretrained( + "HuggingFaceM4/idefics2-8b-base", + load_in_4bit=True, + ) + + from datasets import load_dataset + + dataset = load_dataset("nielsr/docvqa_1200_examples", split="test") + + text = [f"{dataset[40]['query']['en']}", f"{dataset[41]['query']['en']}"] + images = [[dataset[40]["image"]], [dataset[41]["image"]]] + inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt").to(torch_device) + generated_ids = model.generate(**inputs, max_new_tokens=64) + batched_generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True) + + text = f"{dataset[40]['query']['en']}" + images = dataset[40]["image"] + inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt").to(torch_device) + generated_ids = model.generate(**inputs, max_new_tokens=64) + generated_text_0 
= self.processor.batch_decode(generated_ids, skip_special_tokens=True) + + text = f"{dataset[41]['query']['en']}" + images = dataset[41]["image"] + inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt").to(torch_device) + generated_ids = model.generate(**inputs, max_new_tokens=64) + generated_text_1 = self.processor.batch_decode(generated_ids, skip_special_tokens=True) + + self.assertEqual(batched_generated_texts[0], generated_text_0[0]) + self.assertEqual(batched_generated_texts[1], generated_text_1[0]) + + @require_flash_attn + @require_torch_gpu + @require_bitsandbytes + def test_flash_attn_2_eager_equivalence(self): + # Create inputs + text = "In this image, we see" + images = self.image1 + inputs = self.processor(text=text, images=images, return_tensors="pt", padding=True) + inputs.to(torch_device) + + # Eager model + model_eager = Idefics2ForConditionalGeneration.from_pretrained( + "HuggingFaceM4/idefics2-8b-base", + attn_implementation="eager", + load_in_4bit=True, + ) + generated_ids_eager = model_eager.generate(**inputs, max_new_tokens=10) + generated_texts_eager = self.processor.batch_decode(generated_ids_eager, skip_special_tokens=True) + + del model_eager + + # Flash Attention 2 model + model_flash_attention_2 = Idefics2ForConditionalGeneration.from_pretrained( + "HuggingFaceM4/idefics2-8b-base", + attn_implementation="flash_attention_2", + load_in_4bit=True, + ) + generated_ids_flash_attention_2 = model_flash_attention_2.generate(**inputs, max_new_tokens=10) + generated_texts_flash_attention_2 = self.processor.batch_decode( + generated_ids_flash_attention_2, skip_special_tokens=True + ) + + self.assertEqual(generated_texts_eager[0], generated_texts_flash_attention_2[0]) diff --git a/docs/transformers/tests/models/idefics2/test_processor_idefics2.py b/docs/transformers/tests/models/idefics2/test_processor_idefics2.py new file mode 100644 index 0000000000000000000000000000000000000000..a39d14d4f17ee6d9613662873c220060e7787685 --- /dev/null +++ b/docs/transformers/tests/models/idefics2/test_processor_idefics2.py @@ -0,0 +1,334 @@ +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
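The Idefics2 integration tests above all reduce to one pattern: run greedy generation on the same checkpoint under two configurations (batched vs. single sample, eager vs. flash_attention_2) and assert the decoded strings match. Below is a minimal, hedged sketch of the backend-equivalence variant using only public `AutoProcessor`/`AutoModelForVision2Seq` APIs; it assumes a CUDA device and a caller-supplied PIL image, and the checkpoint name, dtype, and token budget are illustrative rather than taken from the tests.

```python
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor


def check_attn_backend_equivalence(image, checkpoint="HuggingFaceM4/idefics2-8b-base"):
    """Compare greedy generations from two attention backends of the same checkpoint.

    `image` is a caller-supplied PIL.Image; a CUDA device is assumed.
    """
    processor = AutoProcessor.from_pretrained(checkpoint)
    inputs = processor(text="In this image, we see", images=image, return_tensors="pt").to("cuda")

    decoded = []
    for impl in ("eager", "flash_attention_2"):
        model = AutoModelForVision2Seq.from_pretrained(
            checkpoint,
            torch_dtype=torch.float16,
            attn_implementation=impl,
            device_map="cuda",
        )
        # Greedy decoding keeps the comparison deterministic across backends.
        ids = model.generate(**inputs, max_new_tokens=10, do_sample=False)
        decoded.append(processor.batch_decode(ids, skip_special_tokens=True)[0])
        del model
        torch.cuda.empty_cache()

    assert decoded[0] == decoded[1], f"attention backends disagree: {decoded}"
```

The tests above load the checkpoint in 4-bit to fit a single GPU and rely on the default (non-sampling) generation settings for the same reason: only deterministic decoding makes a string-equality assertion meaningful.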
+ +import shutil +import tempfile +import unittest +from io import BytesIO + +import requests + +from transformers import Idefics2Processor +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from PIL import Image + + from transformers import ( + AutoProcessor, + Idefics2Processor, + ) + + +@require_torch +@require_vision +class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = Idefics2Processor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + + processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2) + + processor.save_pretrained(cls.tmpdirname) + + cls.image1 = Image.open( + BytesIO( + requests.get( + "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" + ).content + ) + ) + cls.image2 = Image.open( + BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content) + ) + cls.image3 = Image.open( + BytesIO( + requests.get( + "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg" + ).content + ) + ) + cls.bos_token = processor.tokenizer.bos_token + cls.image_token = processor.image_token + cls.fake_image_token = processor.fake_image_token + + cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token) + cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token) + cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token) + cls.image_seq_len = processor.image_seq_len + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + def get_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) + + @staticmethod + def prepare_processor_dict(): + return {"image_seq_len": 2} + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname, ignore_errors=True) + + def test_process_interleaved_images_prompts_no_image_splitting(self): + tokenizer = self.get_tokenizer() + processor = self.get_processor() + + processor.image_processor.do_image_splitting = False + + # Test that a single image is processed correctly + inputs = processor(images=self.image1) + self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 653, 980)) + self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 653, 980)) + # fmt: on + + # Test a single sample with image and text + image_str = "" + text_str = "In this image, we see" + text = image_str + text_str + inputs = processor(text=text, images=self.image1) + + # fmt: off + tokenized_sentence = tokenizer(text_str, add_special_tokens=False) + expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]] + self.assertEqual(inputs["input_ids"], expected_input_ids) + self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])]) + self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 653, 980)) + self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 653, 980)) + # fmt: on + + # Test that batch is correctly processed + image_str = "" + 
text_str_1 = "In this image, we see" + text_str_2 = "bla, bla" + + text = [ + image_str + text_str_1, + text_str_2 + image_str + image_str, + ] + images = [[self.image1], [self.image2, self.image3]] + + inputs = processor(text=text, images=images, padding=True) + + # fmt: off + tokenized_sentence_1 = tokenizer(text_str_1, add_special_tokens=False) + tokenized_sentence_2 = tokenizer(text_str_2, add_special_tokens=False) + expected_input_ids_1 = [self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"] + expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + # Pad the first input to match the second input + pad_len = len(expected_input_ids_2) - len(expected_input_ids_1) + padded_expected_input_ids_1 = [0] * pad_len + expected_input_ids_1 + + self.assertEqual( + inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2] + ) + self.assertEqual( + inputs["attention_mask"], + [[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)] + ) + self.assertEqual(inputs['pixel_values'].shape, (2, 2, 3, 767, 980)) + self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 2, 767, 980)) + # fmt: on + + def test_process_interleaved_images_prompts_image_splitting(self): + processor = self.get_processor() + tokenizer = self.get_tokenizer() + processor.image_processor.do_image_splitting = True + + # Test that a single image is processed correctly + inputs = processor(images=self.image1) + self.assertEqual(inputs["pixel_values"].shape, (1, 5, 3, 653, 980)) + self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 5, 653, 980)) + # fmt: on + + # Test a single sample with image and text + image_str = "" + text_str = "In this image, we see" + text = image_str + text_str + inputs = processor(text=text, images=self.image1) + + # fmt: off + tokenized_sentence = tokenizer(text_str, add_special_tokens=False) + expected_input_ids = [[self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence["input_ids"]] + self.assertEqual(inputs["input_ids"], expected_input_ids) + self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])]) + self.assertEqual(inputs["pixel_values"].shape, (1, 5, 3, 653, 980)) + self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 5, 653, 980)) + # fmt: on + + # Test that batch is correctly processed + image_str = "" + text_str_1 = "In this image, we see" + text_str_2 = "bla, bla" + + text = [ + image_str + text_str_1, + text_str_2 + image_str + image_str, + ] + images = [[self.image1], [self.image2, self.image3]] + + inputs = processor(text=text, images=images, padding=True) + + # fmt: off + tokenized_sentence_1 = tokenizer(text_str_1, add_special_tokens=False) + tokenized_sentence_2 = tokenizer(text_str_2, add_special_tokens=False) + expected_input_ids_1 = [self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"] + expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + ([self.fake_image_token_id] + [self.image_token_id] * 
self.image_seq_len) * 5 + [self.fake_image_token_id] + # Pad the first input to match the second input + pad_len = len(expected_input_ids_2) - len(expected_input_ids_1) + padded_expected_input_ids_1 = [0] * pad_len + expected_input_ids_1 + + self.assertEqual( + inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2] + ) + self.assertEqual( + inputs["attention_mask"], + [[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)] + ) + self.assertEqual(inputs['pixel_values'].shape, (2, 10, 3, 767, 980)) + self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 10, 767, 980)) + # fmt: on + + def test_add_special_tokens_processor(self): + processor = self.get_processor() + tokenizer = self.get_tokenizer() + image_str = "" + text_str = "In this image, we see" + text = text_str + image_str + + n_image_repeat = 5 if processor.image_processor.do_image_splitting else 1 + + # fmt: off + inputs = processor(text=text, images=self.image1, add_special_tokens=False) + tokenized_sentence = tokenizer(text_str, add_special_tokens=False) + expected_input_ids = [tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]] + self.assertEqual(inputs["input_ids"], expected_input_ids) + + inputs = processor(text=text, images=self.image1) + expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]] + self.assertEqual(inputs["input_ids"], expected_input_ids) + # fmt: on + + def test_non_nested_images_with_batched_text(self): + processor = self.get_processor() + processor.image_processor.do_image_splitting = False + + image_str = "" + text_str_1 = "In this image, we see" + text_str_2 = "bla, bla" + + text = [ + image_str + text_str_1, + text_str_2 + image_str + image_str, + ] + images = [self.image1, self.image2, self.image3] + + inputs = processor(text=text, images=images, padding=True) + + self.assertEqual(inputs["pixel_values"].shape, (2, 2, 3, 767, 980)) + self.assertEqual(inputs["pixel_attention_mask"].shape, (2, 2, 767, 980)) + + def test_process_interleaved_images_prompts_image_error(self): + processor = self.get_processor() + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[self.image1], []] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], [self.image2, self.image3]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1, self.image2, self.image3] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], []] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[], [self.image2]] + with 
self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1, self.image2] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + + def test_apply_chat_template(self): + # Message contains content which a mix of lists with images and image urls and string + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What do these images show?"}, + {"type": "image"}, + {"type": "image"}, + "What do these images show?", + ], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.", + } + ], + }, + {"role": "user", "content": [{"type": "text", "text": "And who is that?"}]}, + ] + + processor = self.get_processor() + # Make short sequence length to test that the fake tokens are added correctly + rendered = processor.apply_chat_template(messages, add_generation_prompt=True) + + expected_rendered = ( + "User: What do these images show?\n" + "Assistant: The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.\n" + "User: And who is that?\n" + "Assistant:" + ) + self.assertEqual(rendered, expected_rendered) diff --git a/docs/transformers/tests/models/idefics3/__init__.py b/docs/transformers/tests/models/idefics3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/idefics3/test_image_processing_idefics3.py b/docs/transformers/tests/models/idefics3/test_image_processing_idefics3.py new file mode 100644 index 0000000000000000000000000000000000000000..01ea063aa3e713cba6e690689e3f68fe4788fdcb --- /dev/null +++ b/docs/transformers/tests/models/idefics3/test_image_processing_idefics3.py @@ -0,0 +1,283 @@ +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
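The Idefics2 processor tests above derive every expected `input_ids` sequence from one expansion rule: each image placeholder becomes the fake image token followed by `image_seq_len` image tokens, repeated once per crop when image splitting is enabled, with a single fake image token closing the whole image block. The sketch below restates that rule in plain Python; the token ids and `image_seq_len` are made-up values for illustration, not the real Idefics2 vocabulary.

```python
def expand_image_tokens(text_ids, n_images, image_token_id=32001, fake_image_token_id=32000,
                        image_seq_len=2, bos_token_id=1, image_splitting_factor=1):
    """Build the expected id sequence for `n_images` image placeholders followed by text.

    With image splitting on, each image contributes `image_splitting_factor` crop blocks
    (5 for Idefics2: four tiles plus the resized global image).
    """
    per_image = ([fake_image_token_id] + [image_token_id] * image_seq_len) * image_splitting_factor
    image_block = per_image * n_images + [fake_image_token_id]  # one fake token closes the block
    return [bos_token_id] + image_block + text_ids


# One un-split image in front of a three-token sentence:
print(expand_image_tokens([5, 6, 7], n_images=1))
# [1, 32000, 32001, 32001, 32000, 5, 6, 7]
```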
+ + +import unittest + +import numpy as np + +from transformers.image_utils import PILImageResampling +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin + + +if is_vision_available(): + from PIL import Image + + from transformers import Idefics3ImageProcessor + + +if is_torch_available(): + import torch + + +class Idefics3ImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + num_images=1, + image_size=18, + min_resolution=30, + max_resolution=40, + do_resize=True, + size=None, + max_image_size=None, + do_rescale=True, + rescale_factor=1 / 255, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + do_convert_rgb=True, + do_pad=True, + do_image_splitting=True, + resample=PILImageResampling.LANCZOS, + ): + self.size = size if size is not None else {"longest_edge": max_resolution} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.num_images = num_images + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.resample = resample + self.do_image_splitting = do_image_splitting + self.max_image_size = max_image_size if max_image_size is not None else {"longest_edge": 20} + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + self.do_pad = do_pad + + def prepare_image_processor_dict(self): + return { + "do_convert_rgb": self.do_convert_rgb, + "do_resize": self.do_resize, + "size": self.size, + "max_image_size": self.max_image_size, + "do_rescale": self.do_rescale, + "rescale_factor": self.rescale_factor, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_pad": self.do_pad, + "do_image_splitting": self.do_image_splitting, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to Idefics3ImageProcessor, + assuming do_resize is set to True. The expected size in that case the max image size. + """ + return self.max_image_size["longest_edge"], self.max_image_size["longest_edge"] + + def expected_output_image_shape(self, images): + height, width = self.get_expected_values(images, batched=True) + effective_nb_images = ( + self.num_images * 5 if self.do_image_splitting else 1 + ) # 5 is a squared image divided into 4 + global image resized + return effective_nb_images, self.num_channels, height, width + + def prepare_image_inputs( + self, + batch_size=None, + min_resolution=None, + max_resolution=None, + num_channels=None, + num_images=None, + size_divisor=None, + equal_resolution=False, + numpify=False, + torchify=False, + ): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. + + One can specify whether the images are of the same resolution or not. 
+ """ + assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" + + batch_size = batch_size if batch_size is not None else self.batch_size + min_resolution = min_resolution if min_resolution is not None else self.min_resolution + max_resolution = max_resolution if max_resolution is not None else self.max_resolution + num_channels = num_channels if num_channels is not None else self.num_channels + num_images = num_images if num_images is not None else self.num_images + + images_list = [] + for i in range(batch_size): + images = [] + for j in range(num_images): + if equal_resolution: + width = height = max_resolution + else: + # To avoid getting image width/height 0 + if size_divisor is not None: + # If `size_divisor` is defined, the image needs to have width/size >= `size_divisor` + min_resolution = max(size_divisor, min_resolution) + width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2) + images.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8)) + images_list.append(images) + + if not numpify and not torchify: + # PIL expects the channel dimension as last dimension + images_list = [[Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] for images in images_list] + + if torchify: + images_list = [[torch.from_numpy(image) for image in images] for images in images_list] + + if numpify: + # Numpy images are typically in channels last format + images_list = [[image.transpose(1, 2, 0) for image in images] for images in images_list] + + return images_list + + +@require_torch +@require_vision +class Idefics3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = Idefics3ImageProcessor if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = Idefics3ImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "resample")) + self.assertTrue(hasattr(image_processing, "do_image_splitting")) + self.assertTrue(hasattr(image_processing, "max_image_size")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_pad")) + self.assertTrue(hasattr(image_processing, "do_image_splitting")) + + def test_call_numpy(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for sample_images in image_inputs: + for image in sample_images: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = 
self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + def test_call_numpy_4_channels(self): + # Idefics3 always processes images as RGB, so it always returns images with 3 channels + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processor_dict = self.image_processor_dict + image_processing = self.image_processing_class(**image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + + for sample_images in image_inputs: + for image in sample_images: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + def test_call_pil(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) + for images in image_inputs: + for image in images: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + def test_call_pytorch(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + + for images in image_inputs: + for image in images: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + 
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + tuple(encoded_images.shape), + (self.image_processor_tester.batch_size, *expected_output_image_shape), + ) diff --git a/docs/transformers/tests/models/idefics3/test_modeling_idefics3.py b/docs/transformers/tests/models/idefics3/test_modeling_idefics3.py new file mode 100644 index 0000000000000000000000000000000000000000..69a0f85acefa869cede10d772640b01634952921 --- /dev/null +++ b/docs/transformers/tests/models/idefics3/test_modeling_idefics3.py @@ -0,0 +1,565 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Idefics3 model.""" + +import copy +import unittest +from io import BytesIO + +import pytest +import requests + +from transformers import ( + AutoProcessor, + is_torch_available, + is_vision_available, +) +from transformers.testing_utils import ( + cleanup, + require_bitsandbytes, + require_torch, + require_torch_sdpa, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + Idefics3Config, + Idefics3ForConditionalGeneration, + Idefics3Model, + ) + +if is_vision_available(): + from PIL import Image + + +class Idefics3VisionText2TextModelTester: + def __init__( + self, + parent, + is_training=True, + batch_size=2, + scale_factor=2, + num_images=2, + vision_config={ + "image_size": 16, + "patch_size": 4, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 32, + "dropout": 0.1, + "attention_dropout": 0.1, + "initializer_range": 0.02, + }, + text_config={ + "vocab_size": 100, + "hidden_size": 64, + "intermediate_size": 56, + "num_hidden_layers": 3, + "num_attention_heads": 2, + "num_key_value_heads": 2, + "hidden_act": "silu", + "max_position_embeddings": 256, + "initializer_range": 0.02, + "rms_norm_eps": 1e-6, + "pad_token_id": 2, + "bos_token_id": 0, + "eos_token_id": 1, + "image_token_id": 57, + "tie_word_embeddings": False, + "rope_theta": 10000.0, + "sliding_window": 32, + "attention_dropout": 0.0, + }, + use_cache=False, + tie_word_embeddings=False, + image_token_id=57, + ): + self.parent = parent + self.is_training = is_training + self.batch_size = batch_size + self.num_images = num_images + self.scale_factor = scale_factor + self.seq_length = ( + int(((vision_config["image_size"] // vision_config["patch_size"]) ** 2) / (self.scale_factor**2)) + * self.num_images + ) + self.use_cache = use_cache + self.image_token_id = image_token_id + self.tie_word_embeddings = tie_word_embeddings + # Hack - add properties here so use common tests + self.vocab_size = text_config["vocab_size"] + 
self.num_hidden_layers = text_config["num_hidden_layers"] + self.num_attention_heads = text_config["num_attention_heads"] + self.hidden_size = text_config["hidden_size"] + + self.vision_config = vision_config + self.text_config = text_config + + def get_config(self): + return Idefics3Config( + use_cache=self.use_cache, + image_token_id=self.image_token_id, + tie_word_embeddings=self.tie_word_embeddings, + vision_config=self.vision_config, + text_config=self.text_config, + vocab_size=self.vocab_size, + scale_factor=self.scale_factor, + ) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [ + self.batch_size, + self.num_images, + 3, # Idefics3ImageProcessor always generates RGB pixel values + self.vision_config["image_size"], + self.vision_config["image_size"], + ] + ) + config = self.get_config() + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1 + + # For simplicity just set the last n tokens to the image token + n_image_tokens_per_batch = self.seq_length + input_ids[:, -n_image_tokens_per_batch:] = self.image_token_id + attention_mask = input_ids.ne(1).to(torch_device) + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class Idefics3ModelTest(ModelTesterMixin, unittest.TestCase): + """ + Model tester for `Idefics3`. + """ + + all_model_classes = (Idefics3Model,) if is_torch_available() else () + fx_compatible = False + test_torchscript = False + test_pruning = False + test_resize_embeddings = True + test_head_masking = False + + def setUp(self): + self.model_tester = Idefics3VisionText2TextModelTester(self) + self.config_tester = ConfigTester( + self, config_class=Idefics3Config, has_text_modality=False, common_properties=["image_token_id"] + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="input_embeds cannot be passed in without input_ids") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="input_embeds cannot be passed in without input_ids") + def test_inputs_embeds_matches_input_ids(self): + pass + + @unittest.skip(reason="Model does not support padding right") + def test_flash_attn_2_inference_padding_right(self): + pass + + @unittest.skip(reason="Compile not yet supported in idefics3 models") + def test_sdpa_can_compile_dynamic(self): + pass + + # We need to override as we need to prepare such that the image token is the last token + def test_resize_tokens_embeddings(self): + (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + if self.model_tester.is_training is False: + model.eval() + + model_vocab_size = config.text_config.vocab_size + # Retrieve the embeddings and clone them + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10) + # Check that it actually resizes the
embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) + + # Ignore copy + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2) + n_images = self.model_tester.num_images * self.model_tester.seq_length + model.image_token_id = model_vocab_size - 15 - 1 + inputs_dict["input_ids"][:, -n_images:] = model.image_token_id + + # make sure that decoder_input_ids are resized as well + if "decoder_input_ids" in inputs_dict: + inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that adding and removing tokens has not modified the first part of the embedding matrix. + models_equal = True + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + model_vocab_size = config.text_config.vocab_size + model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1) + self.assertTrue(model.config.text_config.vocab_size + 10, model_vocab_size) + + model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0] // 64, 0) + + self.assertTrue(model_embed.weight.shape[0], model.config.text_config.vocab_size) + self.assertTrue(model.config.text_config.vocab_size, model.vocab_size) + + model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0] // 64, 0) + + # Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size + target_dimension = 128 + model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0], target_dimension) + + with self.assertRaisesRegex( + ValueError, + "Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. 
Please make sure to pass an integer", + ): + model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3) + + # We need to override as we need to prepare such that the image token is the last token + def test_resize_embeddings_untied(self): + (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() + + original_config.tie_word_embeddings = False + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config).to(torch_device) + + # if no output embeddings -> leave test + if model.get_output_embeddings() is None: + continue + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_vocab_size = config.text_config.vocab_size + model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10) + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2) + n_images = self.model_tester.num_images * self.model_tester.seq_length + model.image_token_id = model_vocab_size - 15 - 1 + inputs_dict["input_ids"][:, -n_images:] = model.image_token_id + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + +@require_torch +class Idefics3ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTesterMixin, unittest.TestCase): + """ + Model tester for `Idefics3ForConditionalGeneration`. 
+ """ + + all_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = {"image-text-to-text": Idefics3ForConditionalGeneration} if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_resize_embeddings = True + test_head_masking = False + test_torchscript = False + + def setUp(self): + self.model_tester = Idefics3VisionText2TextModelTester(self) + self.config_tester = ConfigTester(self, config_class=Idefics3Config, has_text_modality=False) + + @unittest.skip(reason="input_embeds cannot be passed in without input_ids") + def test_inputs_embeds(): + pass + + @unittest.skip(reason="Model does not support padding right") + def test_flash_attn_2_inference_padding_right(self): + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate(self): + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") + def test_contrastive_generate_low_memory(self): + pass + + @unittest.skip( + reason="Prompt lookup decoding needs a way to indicate `bad_word_ids` that should not be suggested as candidates" + ) + def test_prompt_lookup_decoding_matches_greedy_search(self): + pass + + @pytest.mark.generate + @require_torch_sdpa + @slow + @unittest.skip( + reason="Idefics3 doesn't support SDPA for all backbones, vision backbones has only eager/FA2 attention" + ) + def test_eager_matches_sdpa_generate(self): + pass + + @unittest.skip(reason="Compile not yet supported in Idefics3 models end-to-end") + def test_sdpa_can_compile_dynamic(self): + pass + + # We need to override as we need to prepare such that the image token is the last token + def test_resize_tokens_embeddings(self): + (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + model_vocab_size = config.text_config.vocab_size + # Retrieve the embeddings and clone theme + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last 
token + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2) + n_images = self.model_tester.num_images * self.model_tester.seq_length + model.model.image_token_id = model_vocab_size - 15 - 1 + inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id + + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that adding and removing tokens has not modified the first part of the embedding matrix. + models_equal = True + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + model_vocab_size = config.text_config.vocab_size + model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1) + self.assertTrue(model.config.text_config.vocab_size + 10, model_vocab_size) + + model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0] // 64, 0) + + self.assertTrue(model_embed.weight.shape[0], model.config.text_config.vocab_size) + self.assertTrue(model.config.text_config.vocab_size, model.vocab_size) + + model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0] // 64, 0) + + # Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size + target_dimension = 128 + model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0], target_dimension) + + with self.assertRaisesRegex( + ValueError, + "Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. 
Please make sure to pass an integer", + ): + model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3) + + # We need to override as we need to prepare such that the image token is the last token + def test_resize_embeddings_untied(self): + (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() + + original_config.tie_word_embeddings = False + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config).to(torch_device) + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_vocab_size = config.text_config.vocab_size + model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10) + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2) + n_images = self.model_tester.num_images * self.model_tester.seq_length + model.model.image_token_id = model_vocab_size - 15 - 1 + inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + +@require_torch +class Idefics3ForConditionalGenerationIntegrationTest(unittest.TestCase): + def setUp(self): + self.processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3") + self.image1 = Image.open( + BytesIO( + requests.get( + "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" + ).content + ) + ) + self.image2 = Image.open( + BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content) + ) + self.image3 = Image.open( + BytesIO( + requests.get( + "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg" + ).content + ) + ) + + def tearDown(self): + cleanup(torch_device, gc_collect=True) + + @slow + @unittest.skip("multi-gpu tests are disabled for now") + def test_integration_test(self): + model = Idefics3ForConditionalGeneration.from_pretrained( + "HuggingFaceM4/Idefics3-8B-Llama3", + torch_dtype=torch.bfloat16, + device_map="auto", + ) + + # Create inputs + text = "In this image, we see" + images = self.image1 + inputs = 
self.processor(text=text, images=images, return_tensors="pt", padding=True) + inputs.to(torch_device) + + generated_ids = model.generate(**inputs, max_new_tokens=10) + generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True) + + expected_generated_text = "In this image, we see the Statue of Liberty, which is located on Liberty" + self.assertEqual(generated_texts[0], expected_generated_text) + + @slow + @require_bitsandbytes + @unittest.skip("multi-gpu tests are disabled for now") + def test_integration_test_4bit(self): + # Let' s make sure we test the preprocessing to replace what is used + model = Idefics3ForConditionalGeneration.from_pretrained( + "HuggingFaceM4/Idefics3-8B-Llama3", + load_in_4bit=True, + device_map="auto", + ) + + # Create pixel inputs + text = ["In this image, we see", "bla, bla "] + images = [[self.image1], [self.image2, self.image3]] + inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt") + + generated_ids = model.generate(**inputs, max_new_tokens=10) + generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True) + + expected_generated_text = "In this image, we see the Statue of Liberty, trees, buildings, water" + self.assertEqual(generated_texts[0], expected_generated_text) diff --git a/docs/transformers/tests/models/idefics3/test_processor_idefics3.py b/docs/transformers/tests/models/idefics3/test_processor_idefics3.py new file mode 100644 index 0000000000000000000000000000000000000000..99b931a12c280c4f7cbf474a4aeb988c008804dd --- /dev/null +++ b/docs/transformers/tests/models/idefics3/test_processor_idefics3.py @@ -0,0 +1,427 @@ +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
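The overridden resize tests above all assert the same invariant: after `resize_token_embeddings`, the reported vocabulary size and the embedding matrix grow together, the previously learned rows are preserved, and a forward pass still succeeds. Below is a hedged, model-agnostic sketch of that check for a text-only forward pass; it is not the tests' implementation. Note that the Idefics3 tests read the vocabulary size from `config.text_config.vocab_size` rather than `config.vocab_size`, and also pass `pixel_values` in their forward calls.

```python
import torch


def check_embedding_resize(model, extra_tokens=10):
    """Resize the input embeddings and verify the invariants the tests above rely on.

    Assumes a model exposing `resize_token_embeddings` and accepting a plain
    `input_ids` forward (text-only); `extra_tokens` is illustrative.
    """
    old_weight = model.get_input_embeddings().weight.clone()
    old_rows = old_weight.shape[0]

    new_embed = model.resize_token_embeddings(old_rows + extra_tokens)

    # The embedding matrix must grow by exactly `extra_tokens` rows...
    assert new_embed.weight.shape[0] == old_rows + extra_tokens
    # ...and the previously learned rows must be untouched (mirrors the p1/p2 comparison loop above).
    assert torch.equal(new_embed.weight[:old_rows], old_weight)

    # The model must still accept ids drawn from the enlarged vocabulary.
    input_ids = torch.randint(0, old_rows + extra_tokens, (1, 8), device=new_embed.weight.device)
    with torch.no_grad():
        model(input_ids=input_ids)
```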
+ +import shutil +import tempfile +import unittest +from io import BytesIO + +import numpy as np +import requests + +from transformers import Idefics3Processor +from transformers.models.auto.processing_auto import AutoProcessor +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from PIL import Image + + +@require_torch +@require_vision +class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = Idefics3Processor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", image_seq_len=2) + processor.save_pretrained(cls.tmpdirname) + cls.image1 = Image.open( + BytesIO( + requests.get( + "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" + ).content + ) + ) + cls.image2 = Image.open( + BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content) + ) + cls.image3 = Image.open( + BytesIO( + requests.get( + "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg" + ).content + ) + ) + cls.bos_token = processor.tokenizer.bos_token + cls.image_token = processor.image_token + cls.fake_image_token = processor.fake_image_token + cls.global_img_token = processor.global_image_tag + + cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token) + cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token) + cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token) + cls.global_img_tokens_id = processor.tokenizer(cls.global_img_token, add_special_tokens=False)["input_ids"] + cls.padding_token_id = processor.tokenizer.pad_token_id + cls.image_seq_len = processor.image_seq_len + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + def get_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) + + @staticmethod + def prepare_processor_dict(): + return {"image_seq_len": 2} + + def get_split_image_expected_tokens(self, processor, image_rows, image_cols): + text_split_images = [] + for n_h in range(image_rows): + for n_w in range(image_cols): + text_split_images += ( + [self.fake_image_token_id] + + processor.tokenizer(f"", add_special_tokens=False)["input_ids"] + + [self.image_token_id] * self.image_seq_len + ) + text_split_images += processor.tokenizer("\n", add_special_tokens=False)["input_ids"] + text_split_images = text_split_images[:-1] # remove last newline + # add double newline, as it gets its own token + text_split_images += processor.tokenizer("\n\n", add_special_tokens=False)["input_ids"] + text_split_images += ( + [self.fake_image_token_id] + + self.global_img_tokens_id + + [self.image_token_id] * self.image_seq_len + + [self.fake_image_token_id] + ) + return text_split_images + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname, ignore_errors=True) + + def test_process_interleaved_images_prompts_no_image_splitting(self): + processor = self.get_processor() + processor.image_processor.do_image_splitting = False + + # Test that a single image is 
processed correctly + inputs = processor(images=self.image1) + image1_expected_size = (364, 364) + self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 1, 3, *image1_expected_size)) + self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 1, *image1_expected_size)) + # fmt: on + + # Test a single sample with image and text + image_str = "" + text_str = "In this image, we see" + text = image_str + text_str + inputs = processor(text=text, images=self.image1) + + # fmt: off + tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False) + expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + self.global_img_tokens_id + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]] + self.assertEqual(inputs["input_ids"], expected_input_ids) + self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])]) + self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 1, 3, *image1_expected_size)) + self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 1, *image1_expected_size)) + # fmt: on + + # Test that batch is correctly processed + image_str = "" + text_str_1 = "In this image, we see" + text_str_2 = "In this image, we see" + + text = [ + image_str + text_str_1, + image_str + image_str + text_str_2, + ] + images = [[self.image1], [self.image2, self.image3]] + + inputs = processor(text=text, images=images, padding=True) + + # fmt: off + tokenized_sentence_1 = processor.tokenizer(text_str_1, add_special_tokens=False) + tokenized_sentence_2 = processor.tokenizer(text_str_2, add_special_tokens=False) + image_tokens = [self.fake_image_token_id] + self.global_img_tokens_id + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + expected_input_ids_1 = [self.bos_token_id] + image_tokens + tokenized_sentence_1["input_ids"] + expected_input_ids_2 = [self.bos_token_id] + 2 * image_tokens + tokenized_sentence_2["input_ids"] + # Pad the first input to match the second input + pad_len = len(expected_input_ids_2) - len(expected_input_ids_1) + padded_expected_input_ids_1 = [self.padding_token_id] * pad_len + expected_input_ids_1 + + self.assertEqual( + inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2] + ) + self.assertEqual( + inputs["attention_mask"], + [[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)] + ) + self.assertEqual(np.array(inputs['pixel_values']).shape, (2, 2, 3, 364, 364)) + self.assertEqual(np.array(inputs['pixel_attention_mask']).shape, (2, 2, 364, 364)) + # fmt: on + + def test_process_interleaved_images_prompts_image_splitting(self): + processor = self.get_processor() + processor.image_processor.do_image_splitting = True + + # Test that a single image is processed correctly + inputs = processor(images=self.image1) + self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 13, 3, 364, 364)) + self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 13, 364, 364)) + # fmt: on + self.maxDiff = None + + # Test a single sample with image and text + image_str = "" + text_str = "In this image, we see" + text = image_str + text_str + inputs = processor(text=text, images=self.image1) + + # fmt: off + tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False) + split_image1_tokens = self.get_split_image_expected_tokens(processor, 3, 4) + expected_input_ids_1 = [[self.bos_token_id] + split_image1_tokens + tokenized_sentence["input_ids"]] + 
self.assertEqual(inputs["input_ids"], expected_input_ids_1) + self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids_1[0])]) + self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 13, 3, 364, 364)) + self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (1, 13, 364, 364)) + # fmt: on + + # Test that batch is correctly processed + image_str = "" + text_str_1 = "In this image, we see" + text_str_2 = "bla, bla" + + text = [ + image_str + text_str_1, + text_str_2 + image_str + image_str, + ] + images = [[self.image1], [self.image2, self.image3]] + + inputs = processor(text=text, images=images, padding=True) + + # fmt: off + tokenized_sentence_1 = processor.tokenizer(text_str_1, add_special_tokens=False) + tokenized_sentence_2 = processor.tokenizer(text_str_2, add_special_tokens=False) + + split_image1_tokens = self.get_split_image_expected_tokens(processor, 3, 4) + split_image2_tokens = self.get_split_image_expected_tokens(processor, 4, 4) + split_image3_tokens = self.get_split_image_expected_tokens(processor, 3, 4) + expected_input_ids_1 = [self.bos_token_id] + split_image1_tokens + tokenized_sentence_1["input_ids"] + expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + split_image2_tokens + split_image3_tokens + # Pad the first input to match the second input + pad_len = len(expected_input_ids_2) - len(expected_input_ids_1) + padded_expected_input_ids_1 = [self.padding_token_id] * pad_len + expected_input_ids_1 + + self.assertEqual( + inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2] + ) + self.assertEqual( + inputs["attention_mask"], + [[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)] + ) + self.assertEqual(np.array(inputs['pixel_values']).shape, (2, 30, 3, 364, 364)) + self.assertEqual(np.array(inputs['pixel_attention_mask']).shape, (2, 30, 364, 364)) + # fmt: on + + def test_add_special_tokens_processor(self): + processor = self.get_processor() + + image_str = "" + text_str = "In this image, we see" + text = text_str + image_str + + # fmt: off + inputs = processor(text=text, images=self.image1, add_special_tokens=False) + tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False) + split_image1_tokens = self.get_split_image_expected_tokens(processor, 3, 4) + expected_input_ids = [tokenized_sentence["input_ids"] + split_image1_tokens] + self.assertEqual(inputs["input_ids"], expected_input_ids) + + inputs = processor(text=text, images=self.image1) + expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"] + split_image1_tokens] + self.assertEqual(inputs["input_ids"], expected_input_ids) + # fmt: on + + def test_non_nested_images_with_batched_text(self): + processor = self.get_processor() + processor.image_processor.do_image_splitting = False + + image_str = "" + text_str_1 = "In this image, we see" + text_str_2 = "In this image, we see" + + text = [ + image_str + text_str_1, + image_str + image_str + text_str_2, + ] + images = [self.image1, self.image2, self.image3] + + inputs = processor(text=text, images=images, padding=True) + + self.assertEqual(np.array(inputs["pixel_values"]).shape, (2, 2, 3, 364, 364)) + self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (2, 2, 364, 364)) + + # Copied from tests.models.idefics2.test_processor_idefics2.Idefics2ProcessorTest.test_process_interleaved_images_prompts_image_error + def test_process_interleaved_images_prompts_image_error(self): + processor = self.get_processor() + + text = 
[ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[self.image1], []] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], [self.image2, self.image3]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1, self.image2, self.image3] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], []] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1, self.image2] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + + def test_apply_chat_template(self): + # Message contains content which a mix of lists with images and image urls and string + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What do these images show?"}, + {"type": "image"}, + {"type": "image"}, + "What do these images show?", + ], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.", + } + ], + }, + {"role": "user", "content": [{"type": "text", "text": "And who is that?"}]}, + ] + processor = self.get_processor() + # Make short sequence length to test that the fake tokens are added correctly + rendered = processor.apply_chat_template(messages, add_generation_prompt=True) + + expected_rendered = ( + "<|begin_of_text|>User: What do these images show?\n" + "Assistant: The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.\n" + "User: And who is that?\n" + "Assistant:" + ) + self.assertEqual(rendered, expected_rendered) + + @require_torch + @require_vision + def test_text_only_inference(self): + """Test that the processor works correctly with text-only input.""" + processor = self.get_processor() + + text = "This is a simple text without images." 
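+        # Note: the prompt above contains no image tokens, so the processor is expected to return only input_ids and attention_mask (no pixel_values / pixel_attention_mask), as asserted below.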
+ inputs = processor(text=text) + + tokenized_sentence = processor.tokenizer(text, add_special_tokens=False) + expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"]] + + self.assertEqual(inputs["input_ids"], expected_input_ids) + self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])]) + self.assertTrue("pixel_values" not in inputs) + self.assertTrue("pixel_attention_mask" not in inputs) + + # Test batch of texts without image tokens + texts = ["First text.", "Second piece of text."] + batch_inputs = processor(text=texts, padding=True) + + tokenized_1 = processor.tokenizer(texts[0], add_special_tokens=False) + tokenized_2 = processor.tokenizer(texts[1], add_special_tokens=False) + + expected_1 = [self.bos_token_id] + tokenized_1["input_ids"] + expected_2 = [self.bos_token_id] + tokenized_2["input_ids"] + + # Pad the shorter sequence + pad_len = len(expected_2) - len(expected_1) + if pad_len > 0: + padded_expected_1 = [self.padding_token_id] * pad_len + expected_1 + expected_attention_1 = [0] * pad_len + [1] * len(expected_1) + self.assertEqual(batch_inputs["input_ids"], [padded_expected_1, expected_2]) + self.assertEqual(batch_inputs["attention_mask"], [expected_attention_1, [1] * len(expected_2)]) + else: + pad_len = -pad_len + padded_expected_2 = [self.padding_token_id] * pad_len + expected_2 + expected_attention_2 = [0] * pad_len + [1] * len(expected_2) + self.assertEqual(batch_inputs["input_ids"], [expected_1, padded_expected_2]) + self.assertEqual(batch_inputs["attention_mask"], [[1] * len(expected_1), expected_attention_2]) + + @require_torch + @require_vision + def test_missing_images_error(self): + """Test that appropriate error is raised when images are referenced but not provided.""" + processor = self.get_processor() + + # Test single text with image token but no image + text = "Let me show you this image: What do you think?" + with self.assertRaises(ValueError) as context: + processor(text=text) + self.assertTrue("tokens in the text but no images were passed" in str(context.exception)) + + # Test batch with image tokens but no images + texts = [ + "First text with token.", + "Second text with token.", + ] + with self.assertRaises(ValueError) as context: + processor(text=texts) + self.assertTrue("tokens in the text but no images were passed" in str(context.exception)) + + # Test with None as Images + with self.assertRaises(ValueError) as context: + processor(text=text, images=None) + self.assertTrue("tokens in the text but no images were passed" in str(context.exception)) + + with self.assertRaises(ValueError) as context: + processor(text=texts, images=None) + self.assertTrue("tokens in the text but no images were passed" in str(context.exception)) diff --git a/docs/transformers/tests/models/ijepa/__init__.py b/docs/transformers/tests/models/ijepa/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/ijepa/test_modeling_ijepa.py b/docs/transformers/tests/models/ijepa/test_modeling_ijepa.py new file mode 100644 index 0000000000000000000000000000000000000000..1049a4abeb19d2841e317e787ab2b732d8554cad --- /dev/null +++ b/docs/transformers/tests/models/ijepa/test_modeling_ijepa.py @@ -0,0 +1,341 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch IJEPA model.""" + +import unittest + +from transformers import IJepaConfig +from transformers.testing_utils import ( + require_accelerate, + require_torch, + require_torch_accelerator, + require_torch_fp16, + require_vision, + slow, + torch_device, +) +from transformers.utils import ( + cached_property, + is_torch_available, + is_vision_available, +) + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import IJepaForImageClassification, IJepaModel + + +if is_vision_available(): + from PIL import Image + + from transformers import ViTImageProcessor + + +class IJepaModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + scope=None, + encoder_stride=2, + mask_ratio=0.5, + attn_implementation="eager", + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + self.encoder_stride = encoder_stride + self.attn_implementation = attn_implementation + + # in IJEPA, the seq length equals the number of patches (we don't add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + self.mask_ratio = mask_ratio + self.num_masks = int(mask_ratio * self.seq_length) + self.mask_length = num_patches + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [ + self.batch_size, + self.num_channels, + self.image_size, + self.image_size, + ] + ) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return IJepaConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + 
attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + encoder_stride=self.encoder_stride, + attn_implementation=self.attn_implementation, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = IJepaModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, self.seq_length, self.hidden_size), + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = IJepaForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual( + result.logits.shape, + (self.batch_size, self.type_sequence_label_size), + ) + + # test greyscale images + config.num_channels = 1 + model = IJepaForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual( + result.logits.shape, + (self.batch_size, self.type_sequence_label_size), + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + pixel_values, + labels, + ) = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class IJepaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as IJEPA does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = ( + ( + IJepaModel, + IJepaForImageClassification, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + {"image-feature-extraction": IJepaModel, "image-classification": IJepaForImageClassification} + if is_torch_available() + else {} + ) + fx_compatible = True + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + test_torch_exportable = True + + def setUp(self): + self.model_tester = IJepaModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=IJepaConfig, + has_text_modality=False, + hidden_size=37, + ) + + @unittest.skip( + "Since `torch==2.3+cu121`, although this test passes, many subsequent tests have `CUDA error: misaligned address`." + "If `nvidia-xxx-cu118` are also installed, no failure (even with `torch==2.3+cu121`)." 
+ ) + def test_multi_gpu_data_parallel_forward(self): + super().test_multi_gpu_data_parallel_forward() + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="IJEPA does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/ijepa_vith14_1k" + model = IJepaModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class IJepaModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ViTImageProcessor.from_pretrained("facebook/ijepa_vith14_1k") if is_vision_available() else None + + @slow + def test_inference_no_head(self): + model = IJepaModel.from_pretrained("facebook/ijepa_vith14_1k").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the last hidden state + expected_shape = torch.Size((1, 256, 1280)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + expected_slice = torch.Tensor( + [[-0.0621, -0.0054, -2.7513], [-0.1952, 0.0909, -3.9536], [0.0942, -0.0331, -1.2833]] + ).to(torch_device) + + torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) + + @slow + @require_accelerate + @require_torch_accelerator + @require_torch_fp16 + def test_inference_fp16(self): + r""" + A small test to make sure that inference work in half precision without any problem. + """ + model = IJepaModel.from_pretrained( + "facebook/ijepa_vith14_1k", + torch_dtype=torch.float16, + device_map="auto", + ) + image_processor = self.default_image_processor + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + # forward pass to make sure inference works in fp16 + with torch.no_grad(): + _ = model(pixel_values) + + @slow + def test_inference_interpolate_pos_encoding(self): + # I-JEPA, similar to ViT models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. 
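+        # This test only checks that the forward pass with interpolate_pos_encoding=True runs and that the output shape and a small slice of the last hidden state match the reference values below.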
+ model = IJepaModel.from_pretrained("facebook/ijepa_vith14_1k").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(pixel_values, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 256, 1280)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[-0.0621, -0.0054, -2.7513], [-0.1952, 0.0909, -3.9536], [0.0942, -0.0331, -1.2833]] + ).to(torch_device) + + torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/imagegpt/__init__.py b/docs/transformers/tests/models/imagegpt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/imagegpt/test_image_processing_imagegpt.py b/docs/transformers/tests/models/imagegpt/test_image_processing_imagegpt.py new file mode 100644 index 0000000000000000000000000000000000000000..de29b8e29fbdce95e0aef4197c8d71c9f878d9f0 --- /dev/null +++ b/docs/transformers/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -0,0 +1,287 @@ +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import os +import tempfile +import unittest + +import numpy as np +from datasets import load_dataset + +from transformers import AutoImageProcessor +from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision, slow +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import ImageGPTImageProcessor + + +class ImageGPTImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + ): + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + + def prepare_image_processor_dict(self): + return { + # here we create 2 clusters for the sake of simplicity + "clusters": np.asarray( + [ + [0.8866443634033203, 0.6618829369544983, 0.3891746401786804], + [-0.6042559146881104, -0.02295008860528469, 0.5423797369003296], + ] + ), + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + } + + def expected_output_image_shape(self, images): + return (self.size["height"] * self.size["width"],) + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = ImageGPTImageProcessor if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = ImageGPTImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "clusters")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + def test_image_processor_to_json_string(self): + image_processor = self.image_processing_class(**self.image_processor_dict) + obj = json.loads(image_processor.to_json_string()) + for key, value in self.image_processor_dict.items(): + if key == "clusters": + self.assertTrue(np.array_equal(value, obj[key])) + else: + self.assertEqual(obj[key], value) + + def test_image_processor_to_json_file(self): 
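+        # Round-trip the image processor config through a JSON file; the `clusters` entry is a numpy array, so it is compared with np.array_equal rather than plain equality.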
+ image_processor_first = self.image_processing_class(**self.image_processor_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "image_processor.json") + image_processor_first.to_json_file(json_file_path) + image_processor_second = self.image_processing_class.from_json_file(json_file_path).to_dict() + + image_processor_first = image_processor_first.to_dict() + for key, value in image_processor_first.items(): + if key == "clusters": + self.assertTrue(np.array_equal(value, image_processor_second[key])) + else: + self.assertEqual(image_processor_first[key], value) + + def test_image_processor_from_and_save_pretrained(self): + for image_processing_class in self.image_processor_list: + image_processor_first = self.image_processing_class(**self.image_processor_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + image_processor_first.save_pretrained(tmpdirname) + image_processor_second = self.image_processing_class.from_pretrained(tmpdirname).to_dict() + + image_processor_first = image_processor_first.to_dict() + for key, value in image_processor_first.items(): + if key == "clusters": + self.assertTrue(np.array_equal(value, image_processor_second[key])) + else: + self.assertEqual(image_processor_first[key], value) + + def test_image_processor_save_load_with_autoimageprocessor(self): + for image_processing_class in self.image_processor_list: + image_processor_first = image_processing_class(**self.image_processor_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + saved_file = image_processor_first.save_pretrained(tmpdirname)[0] + check_json_file_has_correct_format(saved_file) + + image_processor_second = AutoImageProcessor.from_pretrained(tmpdirname) + + image_processor_first = image_processor_first.to_dict() + image_processor_second = image_processor_second.to_dict() + + for key, value in image_processor_first.items(): + if key == "clusters": + self.assertTrue(np.array_equal(value, image_processor_second[key])) + else: + self.assertEqual(image_processor_first[key], value) + + @unittest.skip(reason="ImageGPT requires clusters at initialization") + def test_init_without_params(self): + pass + + # Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input + def test_call_pil(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(encoded_images) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + # Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input + def test_call_numpy(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for image in image_inputs: + 
self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(encoded_images) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + @unittest.skip(reason="ImageGPT assumes clusters for 3 channels") + def test_call_numpy_4_channels(self): + pass + + # Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input + def test_call_pytorch(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").input_ids + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").input_ids + self.assertEqual( + tuple(encoded_images.shape), + (self.image_processor_tester.batch_size, *expected_output_image_shape), + ) + + +def prepare_images(): + # we use revision="refs/pr/1" until the PR is merged + # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 + dataset = load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1") + + image1 = dataset[4]["image"] + image2 = dataset[5]["image"] + + images = [image1, image2] + + return images + + +@require_vision +@require_torch +class ImageGPTImageProcessorIntegrationTest(unittest.TestCase): + @slow + def test_image(self): + image_processing = ImageGPTImageProcessor.from_pretrained("openai/imagegpt-small") + + images = prepare_images() + + # test non-batched + encoding = image_processing(images[0], return_tensors="pt") + + self.assertIsInstance(encoding.input_ids, torch.LongTensor) + self.assertEqual(encoding.input_ids.shape, (1, 1024)) + + expected_slice = [306, 191, 191] + self.assertEqual(encoding.input_ids[0, :3].tolist(), expected_slice) + + # test batched + encoding = image_processing(images, return_tensors="pt") + + self.assertIsInstance(encoding.input_ids, torch.LongTensor) + self.assertEqual(encoding.input_ids.shape, (2, 1024)) + + expected_slice = [303, 13, 13] + self.assertEqual(encoding.input_ids[1, -3:].tolist(), expected_slice) diff --git a/docs/transformers/tests/models/imagegpt/test_modeling_imagegpt.py b/docs/transformers/tests/models/imagegpt/test_modeling_imagegpt.py new file mode 100644 index 0000000000000000000000000000000000000000..c20d00e7337253296566e408733d523a68012761 --- /dev/null +++ b/docs/transformers/tests/models/imagegpt/test_modeling_imagegpt.py @@ -0,0 +1,360 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import inspect +import unittest + +from transformers import ImageGPTConfig +from transformers.testing_utils import require_torch, require_vision, run_test_using_subprocess, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + ImageGPTForCausalImageModeling, + ImageGPTForImageClassification, + ImageGPTModel, + ) + +if is_vision_available(): + from PIL import Image + + from transformers import ImageGPTImageProcessor + + +class ImageGPTModelTester: + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_token_type_ids = use_token_type_ids + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.use_mc_token_ids = use_mc_token_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = None + + def get_large_model_config(self): + return ImageGPTConfig.from_pretrained("imagegpt") + + def prepare_config_and_inputs( + self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size - 1) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + mc_token_ids = None + if self.use_mc_token_ids: + mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = 
ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config( + gradient_checkpointing=gradient_checkpointing, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + reorder_and_upcast_attn=reorder_and_upcast_attn, + ) + + head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) + + def get_config( + self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False + ): + return ImageGPTConfig( + vocab_size=self.vocab_size, + n_embd=self.hidden_size, + n_layer=self.num_hidden_layers, + n_head=self.num_attention_heads, + n_inner=self.intermediate_size, + activation_function=self.hidden_act, + resid_pdrop=self.hidden_dropout_prob, + attn_pdrop=self.attention_probs_dropout_prob, + n_positions=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + use_cache=True, + gradient_checkpointing=gradient_checkpointing, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + reorder_and_upcast_attn=reorder_and_upcast_attn, + ) + + def get_pipeline_config(self): + config = self.get_config() + config.vocab_size = 513 + config.max_position_embeddings = 1024 + return config + + def create_and_check_imagegpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = ImageGPTModel(config=config) + model.to(torch_device) + model.eval() + + result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(len(result.past_key_values), config.n_layer) + + def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): + model = ImageGPTForCausalImageModeling(config) + model.to(torch_device) + model.eval() + + labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size - 1) + result = model(input_ids, token_type_ids=token_type_ids, labels=labels) + self.parent.assertEqual(result.loss.shape, ()) + # ImageGPTForCausalImageModeling doesn't have tied input- and output embeddings + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size - 1)) + + def create_and_check_imagegpt_for_image_classification( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args + ): + config.num_labels = self.num_labels + model = ImageGPTForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "head_mask": head_mask, 
+ } + + return config, inputs_dict + + +@require_torch +class ImageGPTModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + (ImageGPTForCausalImageModeling, ImageGPTForImageClassification, ImageGPTModel) if is_torch_available() else () + ) + pipeline_model_mapping = ( + {"image-feature-extraction": ImageGPTModel, "image-classification": ImageGPTForImageClassification} + if is_torch_available() + else {} + ) + test_missing_keys = False + test_torch_exportable = True + + # as ImageGPTForImageClassification isn't included in any auto mapping, we add labels here + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "ImageGPTForImageClassification": + inputs_dict["labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + + return inputs_dict + + # we overwrite the _check_scores method of GenerationTesterMixin, as ImageGPTForCausalImageModeling doesn't have tied input- and output embeddings + def _check_scores(self, batch_size, scores, generated_length, config): + expected_shape = (batch_size, config.vocab_size - 1) + self.assertIsInstance(scores, tuple) + self.assertEqual(len(scores), generated_length) + self.assertListEqual([iter_scores.shape for iter_scores in scores], [expected_shape] * len(scores)) + + @run_test_using_subprocess + def test_beam_search_generate_dict_outputs_use_cache(self): + super().test_beam_search_generate_dict_outputs_use_cache() + + def setUp(self): + self.model_tester = ImageGPTModelTester(self) + self.config_tester = ConfigTester(self, config_class=ImageGPTConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_imagegpt_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_imagegpt_model(*config_and_inputs) + + def test_imagegpt_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_lm_head_model(*config_and_inputs) + + def test_imagegpt_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_imagegpt_for_image_classification(*config_and_inputs) + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "openai/imagegpt-small" + model = ImageGPTModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # 
signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["input_ids"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + @unittest.skip(reason="The model doesn't support left padding") # and it's not used enough to be worth fixing :) + def test_left_padding_compatibility(self): + pass + + @unittest.skip(reason="Model inputs don't fit test pattern") # and it's not used enough to be worth fixing :) + def test_past_key_values_format(self): + pass + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class ImageGPTModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return ImageGPTImageProcessor.from_pretrained("openai/imagegpt-small") if is_vision_available() else None + + @slow + def test_inference_causal_lm_head(self): + model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1024, 512)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor( + [[2.3445, 2.6889, 2.7313], [1.0530, 1.2416, 0.5699], [0.2205, 0.7749, 0.3953]] + ).to(torch_device) + + torch.testing.assert_close(outputs.logits[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/informer/__init__.py b/docs/transformers/tests/models/informer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/informer/test_modeling_informer.py b/docs/transformers/tests/models/informer/test_modeling_informer.py new file mode 100644 index 0000000000000000000000000000000000000000..49408197715f060ecf23a2188c3f14b684ad3655 --- /dev/null +++ b/docs/transformers/tests/models/informer/test_modeling_informer.py @@ -0,0 +1,549 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Informer model.""" + +import inspect +import tempfile +import unittest + +import numpy as np +from huggingface_hub import hf_hub_download + +from transformers import is_torch_available +from transformers.testing_utils import is_flaky, require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +TOLERANCE = 1e-4 + +if is_torch_available(): + import torch + + from transformers import InformerConfig, InformerForPrediction, InformerModel + from transformers.models.informer.modeling_informer import ( + InformerDecoder, + InformerEncoder, + InformerSinusoidalPositionalEmbedding, + ) + + +@require_torch +class InformerModelTester: + def __init__( + self, + parent, + batch_size=13, + prediction_length=7, + context_length=14, + cardinality=19, + embedding_dimension=5, + num_time_features=4, + is_training=True, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + lags_sequence=[1, 2, 3, 4, 5], + sampling_factor=10, + distil=False, + ): + self.parent = parent + self.batch_size = batch_size + self.prediction_length = prediction_length + self.context_length = context_length + self.cardinality = cardinality + self.num_time_features = num_time_features + self.lags_sequence = lags_sequence + self.embedding_dimension = embedding_dimension + self.is_training = is_training + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + + self.encoder_seq_length = min( + sampling_factor * np.ceil(np.log1p(context_length)).astype("int").item(), context_length + ) + self.decoder_seq_length = min( + sampling_factor * np.ceil(np.log1p(prediction_length)).astype("int").item(), prediction_length + ) + self.sampling_factor = sampling_factor + self.distil = distil + + def get_config(self): + return InformerConfig( + prediction_length=self.prediction_length, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + context_length=self.context_length, + lags_sequence=self.lags_sequence, + num_time_features=self.num_time_features, + num_static_categorical_features=1, + num_static_real_features=1, + cardinality=[self.cardinality], + embedding_dimension=[self.embedding_dimension], + sampling_factor=self.sampling_factor, + distil=self.distil, + ) + + def prepare_informer_inputs_dict(self, config): + _past_length = config.context_length + max(config.lags_sequence) + + static_categorical_features = ids_tensor([self.batch_size, 1], config.cardinality[0]) + static_real_features = floats_tensor([self.batch_size, 1]) + + past_time_features = floats_tensor([self.batch_size, _past_length, config.num_time_features]) + past_values = floats_tensor([self.batch_size, _past_length]) + past_observed_mask = floats_tensor([self.batch_size, 
_past_length]) > 0.5 + + # decoder inputs + future_time_features = floats_tensor([self.batch_size, config.prediction_length, config.num_time_features]) + future_values = floats_tensor([self.batch_size, config.prediction_length]) + + inputs_dict = { + "past_values": past_values, + "static_categorical_features": static_categorical_features, + "static_real_features": static_real_features, + "past_time_features": past_time_features, + "past_observed_mask": past_observed_mask, + "future_time_features": future_time_features, + "future_values": future_values, + } + return inputs_dict + + def prepare_config_and_inputs(self): + config = self.get_config() + inputs_dict = self.prepare_informer_inputs_dict(config) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = InformerModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = InformerEncoder.from_pretrained(tmpdirname).to(torch_device) + + transformer_inputs, _, _, _ = model.create_network_inputs(**inputs_dict) + enc_input = transformer_inputs[:, : config.context_length, ...] + dec_input = transformer_inputs[:, config.context_length :, ...] + + encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + embed_positions = InformerSinusoidalPositionalEmbedding( + config.context_length + config.prediction_length, config.d_model + ).to(torch_device) + embed_positions._init_weight() + self.parent.assertTrue(torch.equal(model.encoder.embed_positions.weight, embed_positions.weight)) + self.parent.assertTrue(torch.equal(model.decoder.embed_positions.weight, embed_positions.weight)) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = InformerDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + inputs_embeds=dec_input, + encoder_hidden_states=encoder_last_hidden_state, + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class InformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (InformerModel, InformerForPrediction) if is_torch_available() else () + pipeline_model_mapping = {"feature-extraction": InformerModel} if is_torch_available() else {} + is_encoder_decoder = True + test_pruning = False + test_head_masking = False + test_missing_keys = False + test_torchscript = False + test_inputs_embeds = False + + def setUp(self): + self.model_tester = InformerModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=InformerConfig, + has_text_modality=False, + prediction_length=self.model_tester.prediction_length, + ) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, _ = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + 
model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + if hasattr(self.model_tester, "encoder_seq_length"): + seq_length = self.model_tester.context_length + if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: + seq_length = seq_length * self.model_tester.chunk_length + else: + seq_length = self.model_tester.seq_length + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + if config.is_encoder_decoder: + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) + self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "prediction_length", seq_len) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [decoder_seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + @unittest.skip(reason="Informer does not have tokens embeddings") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip + def test_model_outputs_equivalence(self): + pass + + @unittest.skip + def test_determinism(self): + pass + + @unittest.skip(reason="randomly selects U keys while calculating attentions") + def test_batching_equivalence(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + # # Input is 'static_categorical_features' not 'input_ids' + def test_model_main_input_name(self): + model_signature = inspect.signature(getattr(InformerModel, "forward")) + # The main input is the 
name of the argument after `self` + observed_main_input_name = list(model_signature.parameters.keys())[1] + self.assertEqual(InformerModel.main_input_name, observed_main_input_name) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = [ + "past_values", + "past_time_features", + "past_observed_mask", + "static_categorical_features", + "static_real_features", + "future_values", + "future_time_features", + ] + + expected_arg_names.extend( + [ + "future_observed_mask", + "decoder_attention_mask", + "head_mask", + "decoder_head_mask", + "cross_attn_head_mask", + "encoder_outputs", + "past_key_values", + "output_hidden_states", + "output_attentions", + "use_cache", + "return_dict", + ] + if "future_observed_mask" in arg_names + else [ + "decoder_attention_mask", + "head_mask", + "decoder_head_mask", + "cross_attn_head_mask", + "encoder_outputs", + "past_key_values", + "output_hidden_states", + "output_attentions", + "use_cache", + "return_dict", + ] + ) + + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + context_length = getattr(self.model_tester, "context_length", seq_len) + prediction_length = getattr(self.model_tester, "prediction_length", seq_len) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, context_length], + ) + out_len = len(outputs) + + correct_outlen = 7 + + if "last_hidden_state" in outputs: + correct_outlen += 1 + + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + if "loss" in outputs: + correct_outlen += 1 + + if "params" in outputs: + correct_outlen += 1 + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + 
list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, decoder_seq_length, prediction_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + decoder_seq_length, + encoder_seq_length, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + 2, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, context_length], + ) + + @is_flaky() + def test_retain_grad_hidden_states_attentions(self): + super().test_retain_grad_hidden_states_attentions() + + @unittest.skip(reason="Model does not have input embeddings") + def test_model_get_set_embeddings(self): + pass + + +def prepare_batch(filename="train-batch.pt"): + file = hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset") + batch = torch.load(file, map_location=torch_device, weights_only=True) + return batch + + +@require_torch +@slow +class InformerModelIntegrationTests(unittest.TestCase): + def test_inference_no_head(self): + model = InformerModel.from_pretrained("huggingface/informer-tourism-monthly").to(torch_device) + batch = prepare_batch() + + torch.manual_seed(0) + with torch.no_grad(): + output = model( + past_values=batch["past_values"], + past_time_features=batch["past_time_features"], + past_observed_mask=batch["past_observed_mask"], + static_categorical_features=batch["static_categorical_features"], + future_values=batch["future_values"], + future_time_features=batch["future_time_features"], + ).last_hidden_state + expected_shape = torch.Size((64, model.config.context_length, model.config.d_model)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor( + [[0.4699, 0.7295, 0.8967], [0.4858, 0.3810, 0.9641], [-0.0233, 0.3608, 1.0303]], + device=torch_device, + ) + torch.testing.assert_close(output[0, :3, :3], expected_slice, rtol=TOLERANCE, atol=TOLERANCE) + + def test_inference_head(self): + model = InformerForPrediction.from_pretrained("huggingface/informer-tourism-monthly").to(torch_device) + batch = prepare_batch("val-batch.pt") + + torch.manual_seed(0) + with torch.no_grad(): + output = model( + past_values=batch["past_values"], + past_time_features=batch["past_time_features"], + past_observed_mask=batch["past_observed_mask"], + static_categorical_features=batch["static_categorical_features"], + future_time_features=batch["future_time_features"], + ).encoder_last_hidden_state + + # encoder distils the context length to 1/8th of the original length + expected_shape = torch.Size((64, model.config.context_length // 8, model.config.d_model)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor( + [[0.4170, 0.9067, 0.8153], [0.3004, 0.7574, 0.7066], [0.6803, -0.6323, 1.2802]], 
device=torch_device + ) + torch.testing.assert_close(output[0, :3, :3], expected_slice, rtol=TOLERANCE, atol=TOLERANCE) + + def test_seq_to_seq_generation(self): + model = InformerForPrediction.from_pretrained("huggingface/informer-tourism-monthly").to(torch_device) + batch = prepare_batch("val-batch.pt") + + torch.manual_seed(0) + with torch.no_grad(): + outputs = model.generate( + static_categorical_features=batch["static_categorical_features"], + past_time_features=batch["past_time_features"], + past_values=batch["past_values"], + future_time_features=batch["future_time_features"], + past_observed_mask=batch["past_observed_mask"], + ) + expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.prediction_length)) + self.assertEqual(outputs.sequences.shape, expected_shape) + + expected_slice = torch.tensor([3400.8005, 4289.2637, 7101.9209], device=torch_device) + mean_prediction = outputs.sequences.mean(dim=1) + torch.testing.assert_close(mean_prediction[0, -3:], expected_slice, rtol=1e-1, atol=1e-1) diff --git a/docs/transformers/tests/models/instructblip/__init__.py b/docs/transformers/tests/models/instructblip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/instructblip/test_modeling_instructblip.py b/docs/transformers/tests/models/instructblip/test_modeling_instructblip.py new file mode 100644 index 0000000000000000000000000000000000000000..c795de7a0b51a5cd2ef7096b57c60fedd46ae78c --- /dev/null +++ b/docs/transformers/tests/models/instructblip/test_modeling_instructblip.py @@ -0,0 +1,842 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch InstructBLIP model.""" + +import inspect +import tempfile +import unittest + +import numpy as np +import pytest +import requests +from parameterized import parameterized + +from transformers import ( + CONFIG_MAPPING, + InstructBlipConfig, + InstructBlipProcessor, + InstructBlipQFormerConfig, + InstructBlipVisionConfig, +) +from transformers.testing_utils import ( + require_accelerate, + require_bitsandbytes, + require_torch, + require_torch_sdpa, + require_vision, + slow, + torch_device, +) +from transformers.utils import is_torch_available, is_vision_available + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + floats_tensor, + ids_tensor, + random_attention_mask, +) + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import InstructBlipForConditionalGeneration, InstructBlipVisionModel + + +if is_vision_available(): + from PIL import Image + + +class InstructBlipVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=1e-10, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in case of a vision transformer, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return InstructBlipVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = InstructBlipVisionModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + 
config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class InstructBlipVisionModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as InstructBLIP's vision encoder does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (InstructBlipVisionModel,) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = InstructBlipVisionModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=InstructBlipConfig, + has_text_modality=False, + common_properties=["num_query_tokens", "image_token_index"], + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="InstructBLIP's vision encoder does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="InstructBlipVisionModel is an internal building block, doesn't support standalone training") + def test_training(self): + pass + + @unittest.skip(reason="InstructBlipVisionModel is an internal building block, doesn't support standalone training") + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "Salesforce/instructblip-flan-t5-xl" + model = InstructBlipVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class InstructBlipQFormerModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + bos_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = 
batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + qformer_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + qformer_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = self.get_config() + + return config, input_ids, input_mask, qformer_input_ids, qformer_attention_mask + + def get_config(self): + return InstructBlipQFormerConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + bos_token_id=self.bos_token_id, + ) + + +# this class is based on `OPTModelTester` found in tests/models/opt/test_modeling_opt.py +class InstructBlipTextModelDecoderOnlyTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=100, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + embed_dim=16, + num_labels=3, + word_embed_proj_dim=16, + type_sequence_label_size=2, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.embed_dim = embed_dim + self.num_labels = num_labels + self.type_sequence_label_size = type_sequence_label_size + self.word_embed_proj_dim = word_embed_proj_dim + self.is_encoder_decoder = False + + def prepare_config_and_inputs(self): + config = self.get_config() + + input_ids = ids_tensor([self.batch_size, 
self.seq_length], self.vocab_size).clamp(3) + input_ids[:, -1] = self.eos_token_id # Eos Token + + attention_mask = input_ids.ne(self.pad_token_id) + + return config, input_ids, attention_mask + + def get_config(self): + return CONFIG_MAPPING["opt"]( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + embed_dim=self.embed_dim, + is_encoder_decoder=False, + word_embed_proj_dim=self.word_embed_proj_dim, + ) + + +# this model tester uses a decoder-only language model (OPT) +class InstructBlipForConditionalGenerationDecoderOnlyModelTester: + def __init__( + self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + text_kwargs=None, + is_training=True, + num_query_tokens=10, + image_token_index=4, + ): + if vision_kwargs is None: + vision_kwargs = {} + if qformer_kwargs is None: + qformer_kwargs = {} + if text_kwargs is None: + text_kwargs = {} + + self.parent = parent + self.vision_model_tester = InstructBlipVisionModelTester(parent, **vision_kwargs) + self.qformer_model_tester = InstructBlipQFormerModelTester(parent, **qformer_kwargs) + self.text_model_tester = InstructBlipTextModelDecoderOnlyTester(parent, **text_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test + self.seq_length = self.text_model_tester.seq_length + num_query_tokens # need seq_length for common tests + self.is_training = is_training + self.num_query_tokens = num_query_tokens + self.image_token_index = image_token_index + + def prepare_config_and_inputs(self): + _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + _, _, _, qformer_input_ids, qformer_attention_mask = self.qformer_model_tester.prepare_config_and_inputs() + _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + + config = self.get_config() + vision_tokens = ( + torch.ones((input_ids.shape[0], self.num_query_tokens), device=torch_device, dtype=input_ids.dtype) + * self.image_token_index + ) + input_ids[input_ids == self.image_token_index] = self.text_model_tester.pad_token_id + input_ids = torch.cat([vision_tokens, input_ids], dim=-1) + vision_attention_mask = torch.ones_like(vision_tokens) + attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1) + + return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values + + def get_config(self): + return InstructBlipConfig.from_vision_qformer_text_configs( + vision_config=self.vision_model_tester.get_config(), + qformer_config=self.qformer_model_tester.get_config(), + text_config=self.text_model_tester.get_config(), + num_query_tokens=self.num_query_tokens, + image_token_index=self.image_token_index, + ) + + def create_and_check_for_conditional_generation( + self, config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values + ): + model = InstructBlipForConditionalGeneration(config).to(torch_device).eval() + with torch.no_grad(): + result = model( + pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + qformer_input_ids=qformer_input_ids, + qformer_attention_mask=qformer_attention_mask, + ) + + expected_seq_length = 
self.num_query_tokens + self.text_model_tester.seq_length + self.parent.assertEqual( + result.logits.shape, + (self.vision_model_tester.batch_size, expected_seq_length, self.text_model_tester.vocab_size), + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + "qformer_input_ids": qformer_input_ids, + "qformer_attention_mask": qformer_attention_mask, + "labels": input_ids, + } + return config, inputs_dict + + +@require_torch +class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = {"image-text-to-text": InstructBlipForConditionalGeneration} + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = True + test_attention_outputs = False + test_torchscript = False + _is_composite = True + + def setUp(self): + self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=InstructBlipConfig, + has_text_modality=False, + common_properties=["num_query_tokens", "image_token_index"], + ) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_for_conditional_generation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs) + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="InstructBlipForConditionalGeneration doesn't support inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Tied weights are tested in individual model tests") + def test_tied_weights_keys(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="InstructBlipModel does not have input/output embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip( + "InstructBLIP cannot generate only from input ids, and requires pixel values in all cases to be present" + ) + def test_generate_from_inputs_embeds_with_static_cache(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_load_vision_qformer_text_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Save InstructBlipConfig and check if we can load InstructBlipVisionConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + vision_config = InstructBlipVisionConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) + + # Save 
InstructBlipConfig and check if we can load InstructBlipQFormerConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + qformer_config = InstructBlipQFormerConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict()) + + @slow + def test_model_from_pretrained(self): + model_name = "Salesforce/instructblip-flan-t5-xl" + model = InstructBlipForConditionalGeneration.from_pretrained(model_name) + self.assertIsNotNone(model) + + # overwrite because InstructBLIP internally calls LM.generate() with embeds thus it cannot operate in no cache format + def _check_generate_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): + use_cache = True # force this to be True in case False is passed + super()._check_generate_outputs( + output, config, use_cache=use_cache, num_return_sequences=num_return_sequences, num_beams=num_beams + ) + + # overwrite because InstructBLIP cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present + @pytest.mark.generate + def test_left_padding_compatibility(self): + # NOTE: left-padding results in small numerical differences. This is expected. + # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + # First, filter out models that don't support left padding + # - The model must have generative capabilities + if len(self.all_generative_model_classes) == 0: + self.skipTest(reason="No generative architecture available for this model.") + + # - The model must support padding + if not self.has_attentions: + self.skipTest(reason="This model doesn't support padding.") + + # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) + decoder_only_classes = [] + for model_class in self.all_generative_model_classes: + config, _ = self.prepare_config_and_inputs_for_generate() + if config.is_encoder_decoder: + continue + else: + decoder_only_classes.append(model_class) + if len(decoder_only_classes) == 0: + self.skipTest(reason="No decoder-only architecture available for this model.") + + # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't + # added support for it yet. We skip these models for now. + has_encoder_attributes = any( + attr_name + for attr_name in config.to_dict().keys() + if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" + ) + if has_encoder_attributes: + self.skipTest( + reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." 
+ ) + + # Then, test left-padding + def _prepare_model_kwargs(input_ids, attention_mask, signature): + model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in decoder_only_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict.get("attention_mask") + pixel_values = inputs_dict["pixel_values"] + qformer_input_ids = inputs_dict["qformer_input_ids"] + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + model = model_class(config).to(torch_device).eval() + signature = inspect.signature(model.forward).parameters.keys() + + # no cache as some models require special cache classes to be init outside forward + model.generation_config.use_cache = False + + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) + next_logits_wo_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # With left-padding (length 32) + # can hardcode pad_token to be 0 as we'll do attn masking anyway + pad_token_id = ( + config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 + ) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) + next_logits_with_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # They should result in very similar logits + torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5) + + @unittest.skip( + "InstructBLIP cannot generate only from input ids, and requires pixel values in all cases to be present" + ) + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): + pass + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + """ + Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model. + This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention". + In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model + is loaded, because we manually replicate requested attn implementation on each sub-config when loading. + See https://github.com/huggingface/transformers/pull/32238 for more info + + The test tries to cover most general cases of composite models, VLMs with vision and text configs. Any model + that has a different set of sub-configs has to overwrite this test. 
+ """ + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + if not self._is_composite: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" + vision_attn = "sdpa" if model.vision_model._supports_sdpa else "eager" + qformer_attn = "sdpa" if model.qformer._supports_sdpa else "eager" + + # `None` as it is the requested one which will be assigned to each sub-config + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + self.assertTrue(model.language_model.config._attn_implementation == text_attn) + self.assertTrue(model.vision_model.config._attn_implementation == vision_attn) + self.assertTrue(model.qformer.config._attn_implementation == qformer_attn) + + model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") + model_eager = model_eager.eval().to(torch_device) + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.qformer.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa and any( + module_attn == "sdpa" for module_attn in [text_attn, vision_attn, qformer_attn] + ): + raise ValueError("The SDPA model should have SDPA attention layers") + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "https://huggingface.co/hf-internal-testing/blip-test-image/resolve/main/demo.jpg" + image = Image.open(requests.get(url, stream=True).raw) + return image + + +@require_vision +@require_torch +@slow +class InstructBlipModelIntegrationTest(unittest.TestCase): + @require_bitsandbytes + @require_accelerate + def test_inference_vicuna_7b(self): + processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b") + model = InstructBlipForConditionalGeneration.from_pretrained( + "Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True + ) + + url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + prompt = "What is unusual about this image?" 
+ inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) + + # verify generation + outputs = model.generate(**inputs, max_new_tokens=30) + generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() + + expected_outputs = [32001] * 32 + [2, 1724, 338, 22910, 1048, 445, 1967, 29973, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 19500, 373, 263, 19587, 4272, 11952, 29889] # fmt: off + + self.assertEqual(outputs[0].tolist(), expected_outputs) + self.assertEqual( + generated_text, + "What is unusual about this image? The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while driving on a busy city street.", + ) + + def test_inference_flant5_xl(self): + processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-flan-t5-xl") + model = InstructBlipForConditionalGeneration.from_pretrained( + "Salesforce/instructblip-flan-t5-xl", + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + ).to(torch_device) + + url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + prompt = "What is unusual about this image?" + inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device) + + for k, v in inputs.items(): + if torch.is_floating_point(v): + inputs[k] = v.to(torch.bfloat16) + + outputs = model.generate( + **inputs, + do_sample=False, + num_beams=5, + max_length=256, + min_length=1, + repetition_penalty=1.5, + length_penalty=1.0, + temperature=1, + ) + generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0] + + expected_outputs = [0, 37, 7225, 1023, 9850, 7, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4459, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 37, 388, 19, 5119, 3, 9, 4459, 8677, 28, 46, 3575, 53, 1476, 5223, 12, 34, 6, 15495, 24, 3, 88, 19, 692, 112, 293, 10428, 44, 234, 1066, 145, 338, 3, 9, 50, 1106, 3522, 144, 42, 2192, 7919, 31, 7, 5, 37, 1023, 92, 1267, 3, 9, 381, 13, 119, 3203, 16, 8, 2458, 6, 379, 14264, 6, 9256, 7, 6, 11, 11718, 7, 5, 1] # fmt: skip + + self.assertEqual(outputs[0].tolist(), expected_outputs) + self.assertEqual( + generated_text, + "The unusual image depicts a man ironing clothes on the back of a yellow van in the middle of a busy city street. The man is wearing a yellow shirt with an ironing board attached to it, suggesting that he is doing his own laundry at home rather than using a laundromat or dry cleaner's. The image also shows a number of other vehicles in the background, including buses, taxis, and motorcycles.", + ) + + def test_inference_interpolate_pos_encoding(self): + processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-flan-t5-xl") + model = InstructBlipForConditionalGeneration.from_pretrained( + "Salesforce/instructblip-flan-t5-xl", + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + ).to(torch_device) + processor.image_processor.size = {"height": 500, "width": 500} + + image = prepare_img() + prompt = "What's in the image?" 
+ inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device) + + predictions = model.generate(**inputs, interpolate_pos_encoding=True) + generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() + + self.assertEqual( + predictions[0].tolist(), [0, 37, 1023, 753, 3, 9, 2335, 3823, 30, 8, 2608, 28, 3, 9, 1782, 5, 1] + ) + self.assertEqual(generated_text, "The image features a woman sitting on the beach with a dog.") + + def test_expansion_in_processing(self): + processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-flan-t5-xl") + model = InstructBlipForConditionalGeneration.from_pretrained( + "Salesforce/instructblip-flan-t5-xl", + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + ).to(torch_device) + + image = prepare_img() + prompt = "What's in the image?" + + # Make sure we will go the legacy path by setting these args to None + processor.num_query_tokens = None + model.config.image_token_index = None + inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16) + + predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15) + generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() + + # Add args to the config to trigger new logic when inputs are expanded in processing file + processor.num_query_tokens = model.config.num_query_tokens + processor.tokenizer.add_special_tokens({"additional_special_tokens": [""]}) + model.config.image_token_index = len(processor.tokenizer) - 2 + model.resize_token_embeddings(processor.tokenizer.vocab_size, pad_to_multiple_of=64) + + # Generate again with new inputs + inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16) + predictions_expanded = model.generate(**inputs, do_sample=False, max_new_tokens=15) + generated_text_expanded = processor.batch_decode(predictions_expanded, skip_special_tokens=True)[0].strip() + + self.assertTrue(generated_text_expanded == generated_text) diff --git a/docs/transformers/tests/models/instructblip/test_processor_instructblip.py b/docs/transformers/tests/models/instructblip/test_processor_instructblip.py new file mode 100644 index 0000000000000000000000000000000000000000..6675390e0b2f88237958b7f8817bbb0f93e5a85e --- /dev/null +++ b/docs/transformers/tests/models/instructblip/test_processor_instructblip.py @@ -0,0 +1,184 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import shutil +import tempfile +import unittest + +import pytest + +from transformers.testing_utils import require_vision +from transformers.utils import is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import ( + AutoProcessor, + BertTokenizerFast, + BlipImageProcessor, + GPT2Tokenizer, + InstructBlipProcessor, + PreTrainedTokenizerFast, + ) + + +@require_vision +class InstructBlipProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = InstructBlipProcessor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + + image_processor = BlipImageProcessor() + tokenizer = GPT2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-GPT2Model") + qformer_tokenizer = BertTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-bert") + + processor = InstructBlipProcessor(image_processor, tokenizer, qformer_tokenizer) + + processor.save_pretrained(cls.tmpdirname) + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + def get_qformer_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).qformer_tokenizer + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname, ignore_errors=True) + + def test_save_load_pretrained_additional_features(self): + processor = InstructBlipProcessor( + tokenizer=self.get_tokenizer(), + image_processor=self.get_image_processor(), + qformer_tokenizer=self.get_qformer_tokenizer(), + ) + with tempfile.TemporaryDirectory() as tmpdir: + processor.save_pretrained(tmpdir) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + + processor = InstructBlipProcessor.from_pretrained( + tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, BlipImageProcessor) + self.assertIsInstance(processor.qformer_tokenizer, BertTokenizerFast) + + def test_image_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + qformer_tokenizer = self.get_qformer_tokenizer() + + processor = InstructBlipProcessor( + tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer + ) + + image_input = self.prepare_image_inputs() + + input_feat_extract = image_processor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + qformer_tokenizer = self.get_qformer_tokenizer() + + processor = InstructBlipProcessor( + tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer + ) + + input_str = ["lower newer"] + + encoded_processor = processor(text=input_str) + + 
encoded_tokens = tokenizer(input_str, return_token_type_ids=False) + encoded_tokens_qformer = qformer_tokenizer(input_str, return_token_type_ids=False) + + for key in encoded_tokens.keys(): + self.assertListEqual(encoded_tokens[key], encoded_processor[key]) + + for key in encoded_tokens_qformer.keys(): + self.assertListEqual(encoded_tokens_qformer[key], encoded_processor["qformer_" + key]) + + def test_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + qformer_tokenizer = self.get_qformer_tokenizer() + + processor = InstructBlipProcessor( + tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer + ) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual( + list(inputs.keys()), + ["input_ids", "attention_mask", "qformer_input_ids", "qformer_attention_mask", "pixel_values"], + ) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + def test_tokenizer_decode(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + qformer_tokenizer = self.get_qformer_tokenizer() + + processor = InstructBlipProcessor( + tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer + ) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + def test_model_input_names(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + qformer_tokenizer = self.get_qformer_tokenizer() + + processor = InstructBlipProcessor( + tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer + ) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual( + list(inputs.keys()), + ["input_ids", "attention_mask", "qformer_input_ids", "qformer_attention_mask", "pixel_values"], + ) diff --git a/docs/transformers/tests/models/instructblipvideo/__init__.py b/docs/transformers/tests/models/instructblipvideo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py b/docs/transformers/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py new file mode 100644 index 0000000000000000000000000000000000000000..d6e990085540e921f839f7da29c0e62454064a92 --- /dev/null +++ b/docs/transformers/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py @@ -0,0 +1,190 @@ +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import InstructBlipVideoImageProcessor + + +class InstructBlipVideoProcessingTester: + def __init__( + self, + parent, + batch_size=5, + num_channels=3, + image_size=24, + min_resolution=30, + max_resolution=80, + do_resize=True, + size=None, + do_normalize=True, + image_mean=OPENAI_CLIP_MEAN, + image_std=OPENAI_CLIP_STD, + do_convert_rgb=True, + frames=4, + ): + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + self.frames = frames + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + def expected_output_image_shape(self, images): + return self.frames, self.num_channels, self.size["height"], self.size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + images = prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + # let's simply copy the frames to fake a long video-clip + if numpify or torchify: + videos = [] + for image in images: + if numpify: + video = image[None, ...].repeat(self.frames, 0) + else: + video = image[None, ...].repeat(self.frames, 1, 1, 1) + videos.append(video) + else: + videos = [] + for pil_image in images: + videos.append([pil_image] * self.frames) + + return videos + + +@require_torch +@require_vision +class InstructBlipVideoProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = InstructBlipVideoImageProcessor if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = InstructBlipVideoProcessingTester(self) + + @property + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = 
self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + def test_call_pil(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) + for video in video_inputs: + self.assertIsInstance(video[0], Image.Image) + + # Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!) + encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values + expected_output_video_shape = (1, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values + expected_output_video_shape = (5, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + def test_call_numpy(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True) + for video in video_inputs: + self.assertIsInstance(video, np.ndarray) + + # Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!) + encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values + expected_output_video_shape = (1, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values + expected_output_video_shape = (5, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + def test_call_pytorch(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True) + for video in video_inputs: + self.assertIsInstance(video, torch.Tensor) + + # Test not batched input + encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values + expected_output_video_shape = (1, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values + expected_output_video_shape = (5, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) diff --git a/docs/transformers/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/docs/transformers/tests/models/instructblipvideo/test_modeling_instructblipvideo.py new file mode 100644 index 0000000000000000000000000000000000000000..a4b7f25e34746084a3d0b9db0debfa8846c9f8fe --- /dev/null +++ b/docs/transformers/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -0,0 +1,795 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch InstructBlipVideo model.""" + +import inspect +import tempfile +import unittest + +import numpy as np +import pytest +from huggingface_hub import hf_hub_download +from parameterized import parameterized + +from transformers import ( + CONFIG_MAPPING, + InstructBlipVideoConfig, + InstructBlipVideoProcessor, + InstructBlipVideoQFormerConfig, + InstructBlipVideoVisionConfig, +) +from transformers.testing_utils import ( + require_accelerate, + require_bitsandbytes, + require_torch, + require_torch_sdpa, + require_vision, + slow, + torch_device, +) +from transformers.utils import is_torch_available + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + floats_tensor, + ids_tensor, + random_attention_mask, +) + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import InstructBlipVideoForConditionalGeneration, InstructBlipVideoVisionModel + + +class InstructBlipVideoVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + frames=4, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=1e-10, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.frames = frames + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in case of a vision transformer, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [self.batch_size * self.frames, self.num_channels, self.image_size, self.image_size] + ) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return InstructBlipVideoVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = InstructBlipVideoVisionModel(config=config) + 
model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size * self.frames, num_patches + 1, self.hidden_size) + ) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size * self.frames, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class InstructBlipVideoVisionModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as InstructBlipVideo's vision encoder does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (InstructBlipVideoVisionModel,) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = InstructBlipVideoVisionModelTester(self) + common_properties = ["num_query_tokens", "video_token_index"] + self.config_tester = ConfigTester( + self, config_class=InstructBlipVideoConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="InstructBlipVideo's vision encoder does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="InstructBlipVideo's vision encoder is an nn.Embeddings layer") + def test_model_get_set_embeddings(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip( + reason="InstructBlipVideoVisionModel is an internal building block, doesn't support standalone training" + ) + def test_training(self): + pass + + @unittest.skip( + reason="InstructBlipVideoVisionModel is an internal building block, doesn't support standalone training" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem 
to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "Salesforce/instructblip-vicuna-7b" + model = InstructBlipVideoVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class InstructBlipVideoQFormerModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + bos_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + qformer_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + qformer_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = self.get_config() + + return config, input_ids, input_mask, qformer_input_ids, qformer_attention_mask + + def get_config(self): + return InstructBlipVideoQFormerConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + bos_token_id=self.bos_token_id, + ) + + +# this class is based on `OPTModelTester` found in tests/models/opt/test_modeling_opt.py +class InstructBlipVideoTextModelDecoderOnlyTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=100, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + embed_dim=16, + num_labels=3, + word_embed_proj_dim=16, + type_sequence_label_size=2, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = 
is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.embed_dim = embed_dim + self.num_labels = num_labels + self.type_sequence_label_size = type_sequence_label_size + self.word_embed_proj_dim = word_embed_proj_dim + self.is_encoder_decoder = False + + def prepare_config_and_inputs(self): + config = self.get_config() + + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3) + input_ids[:, -1] = self.eos_token_id # Eos Token + + attention_mask = input_ids.ne(self.pad_token_id) + + return config, input_ids, attention_mask + + def get_config(self): + return CONFIG_MAPPING["opt"]( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + embed_dim=self.embed_dim, + is_encoder_decoder=False, + word_embed_proj_dim=self.word_embed_proj_dim, + ) + + +# this model tester uses a decoder-only language model (OPT) +class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester: + def __init__( + self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + text_kwargs=None, + is_training=True, + num_query_tokens=10, + video_token_index=4, + ): + if vision_kwargs is None: + vision_kwargs = {} + if qformer_kwargs is None: + qformer_kwargs = {} + if text_kwargs is None: + text_kwargs = {} + + self.parent = parent + self.vision_model_tester = InstructBlipVideoVisionModelTester(parent, **vision_kwargs) + self.qformer_model_tester = InstructBlipVideoQFormerModelTester(parent, **qformer_kwargs) + self.text_model_tester = InstructBlipVideoTextModelDecoderOnlyTester(parent, **text_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test + self.frames = self.vision_model_tester.frames + # need seq_length for common tests + self.seq_length = self.text_model_tester.seq_length + (num_query_tokens * self.frames) + self.is_training = is_training + self.num_query_tokens = num_query_tokens + self.video_token_index = video_token_index + + def prepare_config_and_inputs(self): + _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + _, _, _, qformer_input_ids, qformer_attention_mask = self.qformer_model_tester.prepare_config_and_inputs() + _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + _, c, h, w = pixel_values.shape + pixel_values = pixel_values.reshape(-1, self.frames, c, h, w) + + vision_tokens = ( + torch.ones( + (input_ids.shape[0], self.num_query_tokens * self.frames), device=torch_device, dtype=input_ids.dtype + ) + * self.video_token_index + ) + input_ids[input_ids == self.video_token_index] = self.text_model_tester.pad_token_id + input_ids = torch.cat([vision_tokens, input_ids], dim=-1) + 
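        # Editorial annotation (hedged, not part of the original patch): with the defaults
        # num_query_tokens=10 and frames=4, 40 video placeholder tokens are prepended to each
        # text sequence, which is why self.seq_length above equals
        # text seq_length + (num_query_tokens * frames) = 7 + 40 = 47.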
vision_attention_mask = torch.ones_like(vision_tokens) + attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1) + + config = self.get_config() + + return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values + + def get_config(self): + return InstructBlipVideoConfig.from_vision_qformer_text_configs( + vision_config=self.vision_model_tester.get_config(), + qformer_config=self.qformer_model_tester.get_config(), + text_config=self.text_model_tester.get_config(), + num_query_tokens=self.num_query_tokens, + video_token_index=self.video_token_index, + ) + + def create_and_check_for_conditional_generation( + self, config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values + ): + model = InstructBlipVideoForConditionalGeneration(config).to(torch_device).eval() + with torch.no_grad(): + result = model( + pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + qformer_input_ids=qformer_input_ids, + qformer_attention_mask=qformer_attention_mask, + ) + + expected_seq_length = ( + self.num_query_tokens * self.vision_model_tester.frames + ) + self.text_model_tester.seq_length + self.parent.assertEqual( + result.logits.shape, + (self.vision_model_tester.batch_size, expected_seq_length, self.text_model_tester.vocab_size), + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + "qformer_input_ids": qformer_input_ids, + "qformer_attention_mask": qformer_attention_mask, + "labels": input_ids, + } + return config, inputs_dict + + +@require_torch +class InstructBlipVideoForConditionalGenerationDecoderOnlyTest( + ModelTesterMixin, GenerationTesterMixin, unittest.TestCase +): + all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else () + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = True + test_attention_outputs = False + test_torchscript = False + _is_composite = True + + def setUp(self): + self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self) + common_properties = ["num_query_tokens", "video_token_index"] + self.config_tester = ConfigTester( + self, config_class=InstructBlipVideoConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_for_conditional_generation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="InstructBlipVideoForConditionalGeneration doesn't support inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Tied weights are tested in individual model tests") + def test_tied_weights_keys(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="InstructBlipVideoModel does not have input/output embeddings") + def test_model_common_attributes(self): + pass + + @unittest.skip( + "InstructBLIPVideo 
cannot generate only from input ids, and requires pixel values in all cases to be present" + ) + def test_generate_from_inputs_embeds_with_static_cache(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_load_vision_qformer_text_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Save InstructBlipVideoConfig and check if we can load InstructBlipVideoVisionConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + vision_config = InstructBlipVideoVisionConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) + + # Save InstructBlipVideoConfig and check if we can load InstructBlipVideoQFormerConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + qformer_config = InstructBlipVideoQFormerConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict()) + + @slow + def test_model_from_pretrained(self): + model_name = "Salesforce/instructblip-vicuna-7b" + model = InstructBlipVideoForConditionalGeneration.from_pretrained(model_name) + self.assertIsNotNone(model) + + # overwrite because InstructBLIPVideo internally calls LM.generate() with embeds thus it cannot operate in no cache format + def _check_generate_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): + use_cache = True # force this to be True in case False is passed + super()._check_generate_outputs( + output, config, use_cache=use_cache, num_return_sequences=num_return_sequences, num_beams=num_beams + ) + + # overwrite because InstructBLIPVideo cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present + @pytest.mark.generate + def test_left_padding_compatibility(self): + # NOTE: left-padding results in small numerical differences. This is expected. + # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + # First, filter out models that don't support left padding + # - The model must have generative capabilities + if len(self.all_generative_model_classes) == 0: + self.skipTest(reason="No generative architecture available for this model.") + + # - The model must support padding + if not self.has_attentions: + self.skipTest(reason="This model doesn't support padding.") + + # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) + decoder_only_classes = [] + for model_class in self.all_generative_model_classes: + config, _ = self.prepare_config_and_inputs_for_generate() + if config.is_encoder_decoder: + continue + else: + decoder_only_classes.append(model_class) + if len(decoder_only_classes) == 0: + self.skipTest(reason="No decoder-only architecture available for this model.") + + # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't + # added support for it yet. We skip these models for now. 
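        # Editorial annotation (hedged, not part of the original patch): the check below treats any
        # config key starting with "encoder" (other than "encoder_no_repeat_ngram_size") as a sign
        # that the architecture was derived from an encoder-decoder model, e.g. a hypothetical
        # "encoder_hidden_size" entry, and skips the left-padding test in that case.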
+ has_encoder_attributes = any( + attr_name + for attr_name in config.to_dict().keys() + if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" + ) + if has_encoder_attributes: + self.skipTest( + reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." + ) + + # Then, test left-padding + def _prepare_model_kwargs(input_ids, attention_mask, signature): + model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in decoder_only_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict.get("attention_mask") + pixel_values = inputs_dict["pixel_values"] + qformer_input_ids = inputs_dict["qformer_input_ids"] + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + model = model_class(config).to(torch_device).eval() + signature = inspect.signature(model.forward).parameters.keys() + + # no cache as some models require special cache classes to be init outside forward + model.generation_config.use_cache = False + + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) + next_logits_wo_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # With left-padding (length 32) + # can hardcode pad_token to be 0 as we'll do attn masking anyway + pad_token_id = ( + config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 + ) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) + next_logits_with_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # They should result in very similar logits + torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5) + + @unittest.skip( + "InstructBLIPVideo cannot generate only from input ids, and requires pixel values in all cases to be present" + ) + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): + pass + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + """ + Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model. + This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention". + In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model + is loaded, because we manually replicate requested attn implementation on each sub-config when loading. 
+ See https://github.com/huggingface/transformers/pull/32238 for more info + + The test tries to cover most general cases of composite models, VLMs with vision and text configs. Any model + that has a different set of sub-configs has to overwrite this test. + """ + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + if not self._is_composite: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + text_attn = "sdpa" if model.language_model._supports_sdpa else "eager" + vision_attn = "sdpa" if model.vision_model._supports_sdpa else "eager" + qformer_attn = "sdpa" if model.qformer._supports_sdpa else "eager" + + # `None` as it is the requested one which will be assigned to each sub-config + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + self.assertTrue(model.language_model.config._attn_implementation == text_attn) + self.assertTrue(model.vision_model.config._attn_implementation == vision_attn) + self.assertTrue(model.qformer.config._attn_implementation == qformer_attn) + + model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager") + model_eager = model_eager.eval().to(torch_device) + self.assertTrue(model_eager.config._attn_implementation == "eager") + self.assertTrue(model_eager.language_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.qformer.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa and any( + module_attn == "sdpa" for module_attn in [text_attn, vision_attn, qformer_attn] + ): + raise ValueError("The SDPA model should have SDPA attention layers") + + +# We will verify our results on an image of cute cats +def prepare_video(): + video_file = hf_hub_download( + repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset" + ) + video = np.load(video_file)[::2] # sample every 2nd frame to get 4 frames total + return video + + +@require_vision +@require_torch +@require_bitsandbytes +@require_accelerate +@slow +class InstructBlipVideoModelIntegrationTest(unittest.TestCase): + def test_inference_vicuna_7b(self): + processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b") + model = InstructBlipVideoForConditionalGeneration.from_pretrained( + "Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True + ) + + clip = prepare_video() + prompt = "Explain what is happening in this short video." 
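        # Editorial annotation (hedged, not part of the original patch): the processor call below
        # tokenizes the prompt and converts the 4 sampled frames into pixel_values; the generated
        # caption is then compared against a fixed reference transcription of the clip.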
+ inputs = processor(images=clip, text=prompt, return_tensors="pt").to(torch_device, torch.float16) + + # verify generation + outputs = model.generate(**inputs, max_new_tokens=30) + generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() + self.assertEqual( + generated_text, + "Explain what is happening in this short video. a baby girl wearing glasses is reading a book on the bed 1080p", + ) + + def test_expansion_in_processing(self): + processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b") + model = InstructBlipVideoForConditionalGeneration.from_pretrained( + "Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True + ) + + clip = prepare_video() + prompt = "Explain what is happening in this short video." + + # Make sure we will go the legacy path by setting these args to None + processor.num_query_tokens = None + model.config.video_token_index = None + inputs = processor(images=clip, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16) + + predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15) + generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() + + # Add args to the config to trigger new logic when inputs are expanded in processing file + processor.num_query_tokens = model.config.num_query_tokens + processor.tokenizer.add_special_tokens({"additional_special_tokens": [" 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. 
Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer" # fmt: skip + decoding = processor.decode(input_processor.input_ids.squeeze().tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # batched + questions = ["How old is he?", "what's the time"] + input_processor = processor( + images, questions, padding="max_length", max_length=20, truncation=True, return_tensors="pt" + ) + + # verify keys + expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] + actual_keys = sorted(input_processor.keys()) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + # this was obtained with Tesseract 4.1.1 + expected_decoding = " what's the time 7 ITC Limited REPORT AND ACCOUNTS 2013 ITC" + decoding = processor.decode(input_processor.input_ids[1].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # verify bbox + expected_bbox = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 45, 67, 80], [72, 56, 109, 67], [72, 56, 109, 67], [116, 56, 189, 67], [198, 59, 253, 66], [257, 59, 285, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [372, 59, 407, 66], [74, 136, 161, 158], [74, 136, 161, 158], [0, 0, 0, 0]] # fmt: skip + self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox) + + @slow + def test_processor_case_5(self): + # case 5: visual question answering (inference), apply_ocr=False + + image_processor = LayoutLMv3ImageProcessor(apply_ocr=False) + tokenizers = self.get_tokenizers + images = self.get_images + + for tokenizer in tokenizers: + processor = LayoutLMv3Processor(image_processor=image_processor, tokenizer=tokenizer) + + # not batched + question = "What's his name?" + words = ["hello", "world"] + boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] + input_processor = processor(images[0], question, words, boxes, return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] + actual_keys = sorted(input_processor.keys()) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + expected_decoding = " What's his name? hello world" + decoding = processor.decode(input_processor.input_ids.squeeze().tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # batched + questions = ["How old is he?", "what's the time"] + words = [["hello", "world"], ["my", "name", "is", "niels"]] + boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]] + input_processor = processor(images, questions, words, boxes, padding=True, return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] + actual_keys = sorted(input_processor.keys()) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + expected_decoding = " How old is he? 
hello world" + decoding = processor.decode(input_processor.input_ids[0].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + expected_decoding = " what's the time my name is niels" + decoding = processor.decode(input_processor.input_ids[1].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # verify bbox + expected_bbox = [[6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3], [1, 1, 2, 3], [0, 0, 0, 0]] + self.assertListEqual(input_processor.bbox[1].tolist()[-5:], expected_bbox) diff --git a/docs/transformers/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/docs/transformers/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py new file mode 100644 index 0000000000000000000000000000000000000000..a1989d814add8b2e5263c0d34547234be9b0db7c --- /dev/null +++ b/docs/transformers/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -0,0 +1,2426 @@ +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import json +import os +import re +import shutil +import tempfile +import unittest +from functools import lru_cache + +from parameterized import parameterized + +from transformers import ( + AddedToken, + LayoutLMv3TokenizerFast, + SpecialTokensMixin, + is_tf_available, + is_torch_available, + logging, +) +from transformers.models.layoutlmv3.tokenization_layoutlmv3 import VOCAB_FILES_NAMES, LayoutLMv3Tokenizer +from transformers.testing_utils import ( + require_pandas, + require_tf, + require_tokenizers, + require_torch, + slow, +) + +from ...test_tokenization_common import ( + SMALL_TRAINING_CORPUS, + TokenizerTesterMixin, + merge_model_tokenizer_mappings, + use_cache_if_possible, +) + + +logger = logging.get_logger(__name__) + + +@require_tokenizers +@require_pandas +class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/layoutlmv3-base" + tokenizer_class = LayoutLMv3Tokenizer + rust_tokenizer_class = LayoutLMv3TokenizerFast + test_rust_tokenizer = True + # determined by the tokenization algorithm and the way it's decoded by the fast tokenizers + space_between_special_tokens = False + test_seq2seq = False + from_pretrained_kwargs = {"cls_token": ""} + + def get_words_and_boxes(self): + words = ["lower", "newer"] + boxes = [[423, 237, 440, 251], [427, 272, 441, 287]] + + return words, boxes + + def get_words_and_boxes_batch(self): + words = [["lower", "newer"], ["new", "low"]] + boxes = [ + [[423, 237, 440, 251], [427, 272, 441, 287]], + [[961, 885, 992, 912], [256, 38, 330, 58]], + ] + + return words, boxes + + def get_question_words_and_boxes(self): + question = "what's his name?" 
+ words = ["lower", "newer"] + boxes = [[423, 237, 440, 251], [427, 272, 441, 287]] + + return question, words, boxes + + def get_question_words_and_boxes_batch(self): + questions = ["what's his name?", "how is he called?"] + words = [["lower", "newer"], ["newer", "lower"]] + boxes = [ + [[423, 237, 440, 251], [427, 272, 441, 287]], + [[256, 38, 330, 58], [256, 38, 330, 58]], + ] + + return questions, words, boxes + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + cls.special_tokens_map = {"unk_token": ""} + + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(cls.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return LayoutLMv3TokenizerFast.from_pretrained(pretrained_name, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "lower newer" + output_text = "lower newer" + return input_text, output_text + + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") + def test_chat_template_batched(self): + pass + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "lower newer" + bpe_tokens = ["Ġlow", "er", "Ġ", "n", "e", "w", "er"] + tokens = tokenizer.tokenize(text) # , add_prefix_space=True) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutlmv3-base") + + question, words, boxes = self.get_question_words_and_boxes() + + text = tokenizer.encode( + question.split(), + boxes=[tokenizer.pad_token_box for _ in range(len(question.split()))], + add_special_tokens=False, + ) + text_2 = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_pair == [0] + text + [2] + [2] + text_2 + [2] + + def test_add_special_tokens(self): + tokenizers: list[LayoutLMv3Tokenizer] = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + special_token = "[SPECIAL_TOKEN]" + special_token_box = [1000, 1000, 1000, 1000] + + 
tokenizer.add_special_tokens({"cls_token": special_token}) + encoded_special_token = tokenizer.encode( + [special_token], boxes=[special_token_box], add_special_tokens=False + ) + self.assertEqual(len(encoded_special_token), 1) + + decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True) + self.assertTrue(special_token not in decoded) + + def test_add_tokens_tokenizer(self): + tokenizers: list[LayoutLMv3Tokenizer] = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + vocab_size = tokenizer.vocab_size + all_size = len(tokenizer) + + self.assertNotEqual(vocab_size, 0) + + # We usually have added tokens from the start in tests because our vocab fixtures are + # smaller than the original vocabs - let's not assert this + # self.assertEqual(vocab_size, all_size) + + new_toks = ["aaaaa", "bbbbbb", "cccccccccdddddddd"] + added_toks = tokenizer.add_tokens(new_toks) + vocab_size_2 = tokenizer.vocab_size + all_size_2 = len(tokenizer) + + self.assertNotEqual(vocab_size_2, 0) + self.assertEqual(vocab_size, vocab_size_2) + self.assertEqual(added_toks, len(new_toks)) + self.assertEqual(all_size_2, all_size + len(new_toks)) + + words = "aaaaa bbbbbb low cccccccccdddddddd l".split() + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + + tokens = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + + self.assertGreaterEqual(len(tokens), 4) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + + new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} + added_toks_2 = tokenizer.add_special_tokens(new_toks_2) + vocab_size_3 = tokenizer.vocab_size + all_size_3 = len(tokenizer) + + self.assertNotEqual(vocab_size_3, 0) + self.assertEqual(vocab_size, vocab_size_3) + self.assertEqual(added_toks_2, len(new_toks_2)) + self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) + + words = ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l".split() + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + + tokens = tokenizer.encode( + words, + boxes=boxes, + add_special_tokens=False, + ) + + self.assertGreaterEqual(len(tokens), 6) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[0], tokens[1]) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokens[-3]) + self.assertEqual(tokens[0], tokenizer.eos_token_id) + self.assertEqual(tokens[-2], tokenizer.pad_token_id) + + @require_tokenizers + def test_encode_decode_with_spaces(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + + new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)] + tokenizer.add_tokens(new_toks) + input = "[ABC][DEF][ABC][DEF]" + if self.space_between_special_tokens: + output = "[ABC] [DEF] [ABC] [DEF]" + else: + output = input + encoded = tokenizer.encode(input.split(), boxes=boxes, add_special_tokens=False) + decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) + self.assertIn(decoded, [output, output.lower()]) + + @unittest.skip(reason="Not implemented") + def test_right_and_left_truncation(self): + pass + + @unittest.skip(reason="Not implemented") + def test_split_special_tokens(self): + pass + + 
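    # Editorial sketch (hedged, not part of the original patch): the word/box calling convention
    # exercised by the tests in this class. The checkpoint and variable names below are
    # illustrative assumptions only.
    #
    #     tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
    #     words = ["lower", "newer"]
    #     boxes = [[423, 237, 440, 251], [427, 272, 441, 287]]
    #     encoding = tokenizer(words, boxes=boxes, return_tensors="pt")
    #     # -> input_ids, attention_mask and a token-aligned "bbox" tensor
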
@parameterized.expand([(True,), (False,)]) + def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + padding_size = 10 + padding_idx = tokenizer.pad_token_id + + encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_special_tokens_mask=True) + input_ids = encoded_sequence["input_ids"] + special_tokens_mask = encoded_sequence["special_tokens_mask"] + sequence_length = len(input_ids) + + # Test 'longest' and 'no_padding' don't do anything + tokenizer.padding_side = "right" + + not_padded_sequence = tokenizer.encode_plus( + words, + boxes=boxes, + padding=False, + return_special_tokens_mask=True, + ) + not_padded_input_ids = not_padded_sequence["input_ids"] + + not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] + not_padded_sequence_length = len(not_padded_input_ids) + + self.assertTrue(sequence_length == not_padded_sequence_length) + self.assertTrue(input_ids == not_padded_input_ids) + self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) + + not_padded_sequence = tokenizer.encode_plus( + words, + boxes=boxes, + padding=False, + return_special_tokens_mask=True, + ) + not_padded_input_ids = not_padded_sequence["input_ids"] + + not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] + not_padded_sequence_length = len(not_padded_input_ids) + + self.assertTrue(sequence_length == not_padded_sequence_length) + self.assertTrue(input_ids == not_padded_input_ids) + self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) + + # Test right padding + tokenizer_kwargs_right = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "right" + else: + tokenizer_kwargs_right["padding_side"] = "right" + + right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right) + right_padded_input_ids = right_padded_sequence["input_ids"] + + right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] + right_padded_sequence_length = len(right_padded_input_ids) + + self.assertTrue(sequence_length + padding_size == right_padded_sequence_length) + self.assertTrue(input_ids + [padding_idx] * padding_size == right_padded_input_ids) + self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask) + + # Test left padding + tokenizer_kwargs_left = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "left" + else: + tokenizer_kwargs_left["padding_side"] = "left" + + left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left) + left_padded_input_ids = left_padded_sequence["input_ids"] + left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] + left_padded_sequence_length = len(left_padded_input_ids) + + self.assertTrue(sequence_length + padding_size == left_padded_sequence_length) + self.assertTrue([padding_idx] * padding_size + input_ids == left_padded_input_ids) + self.assertTrue([1] * padding_size + special_tokens_mask 
== left_padded_special_tokens_mask) + + if "token_type_ids" in tokenizer.model_input_names: + token_type_ids = encoded_sequence["token_type_ids"] + left_padded_token_type_ids = left_padded_sequence["token_type_ids"] + right_padded_token_type_ids = right_padded_sequence["token_type_ids"] + + assert token_type_ids + [0] * padding_size == right_padded_token_type_ids + assert [0] * padding_size + token_type_ids == left_padded_token_type_ids + + if "attention_mask" in tokenizer.model_input_names: + attention_mask = encoded_sequence["attention_mask"] + right_padded_attention_mask = right_padded_sequence["attention_mask"] + left_padded_attention_mask = left_padded_sequence["attention_mask"] + + self.assertTrue(attention_mask + [0] * padding_size == right_padded_attention_mask) + self.assertTrue([0] * padding_size + attention_mask == left_padded_attention_mask) + + def test_internal_consistency(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + + tokens = [] + for word in words: + tokens.extend(tokenizer.tokenize(word)) + ids = tokenizer.convert_tokens_to_ids(tokens) + ids_2 = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + self.assertListEqual(ids, ids_2) + + tokens_2 = tokenizer.convert_ids_to_tokens(ids) + self.assertNotEqual(len(tokens_2), 0) + text_2 = tokenizer.decode(ids) + self.assertIsInstance(text_2, str) + + output_text = " lower newer" + self.assertEqual(text_2, output_text) + + def test_mask_output(self): + tokenizers = self.get_tokenizers(fast=False, do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + + if ( + tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer" + and "token_type_ids" in tokenizer.model_input_names + ): + information = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True) + sequences, mask = information["input_ids"], information["token_type_ids"] + self.assertEqual(len(sequences), len(mask)) + + def test_number_of_added_tokens(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # test 1: single sequence + words, boxes = self.get_words_and_boxes() + + sequences = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + attached_sequences = tokenizer.encode(words, boxes=boxes, add_special_tokens=True) + + # Method is implemented (e.g. not GPT-2) + if len(attached_sequences) != 2: + self.assertEqual( + tokenizer.num_special_tokens_to_add(pair=False), len(attached_sequences) - len(sequences) + ) + + # test 2: two sequences + question, words, boxes = self.get_question_words_and_boxes() + + sequences = tokenizer.encode(question, words, boxes=boxes, add_special_tokens=False) + attached_sequences = tokenizer.encode(question, words, boxes=boxes, add_special_tokens=True) + + # Method is implemented (e.g. 
not GPT-2) + if len(attached_sequences) != 2: + self.assertEqual( + tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences) + ) + + def test_padding_to_max_length(self): + """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated""" + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + padding_size = 10 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + padding_idx = tokenizer.pad_token_id + + # Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" + encoded_sequence = tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + # FIXME: the next line should be padding(max_length) to avoid warning + padded_sequence = tokenizer.encode( + words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + + # Check that nothing is done when a maximum length is not specified + encoded_sequence = tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True) + padded_sequence_right_length = len(padded_sequence_right) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + def test_padding(self, max_length=50): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + + self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) + pad_token_id = tokenizer_p.pad_token_id + + # Encode - Simple input + words, boxes = self.get_words_and_boxes() + input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) + input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length") + input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length") + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.encode(words, boxes=boxes, padding="longest") + input_p = tokenizer_p.encode(words, boxes=boxes, padding=True) + self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) + + # Encode - Pair input + question, words, boxes = self.get_question_words_and_boxes() + input_r = tokenizer_r.encode( + question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode( + question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True + ) + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, 
padding="max_length") + input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length") + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode(question, words, boxes=boxes, padding=True) + input_p = tokenizer_p.encode(question, words, boxes=boxes, padding="longest") + self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) + + # Encode_plus - Simple input + words, boxes = self.get_words_and_boxes() + input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) + input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length") + input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length") + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + input_r = tokenizer_r.encode_plus(words, boxes=boxes, padding="longest") + input_p = tokenizer_p.encode_plus(words, boxes=boxes, padding=True) + self.assert_padded_input_match( + input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id + ) + + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + # Encode_plus - Pair input + question, words, boxes = self.get_question_words_and_boxes() + input_r = tokenizer_r.encode_plus( + question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode_plus( + question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True + ) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus( + question, words, boxes=boxes, max_length=max_length, padding="max_length" + ) + input_p = tokenizer_p.encode_plus( + question, words, boxes=boxes, max_length=max_length, padding="max_length" + ) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus(question, words, boxes=boxes, padding="longest") + input_p = tokenizer_p.encode_plus(question, words, boxes=boxes, padding=True) + self.assert_padded_input_match( + input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id + ) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + # Batch_encode_plus - Simple input + words, boxes = self.get_words_and_boxes_batch() + + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + pad_to_max_length=True, + ) + input_p = tokenizer_p.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + pad_to_max_length=True, + ) + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + padding="max_length", + ) + input_p = tokenizer_p.batch_encode_plus( + words, + 
boxes=boxes, + max_length=max_length, + padding="max_length", + ) + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + padding="longest", + ) + input_p = tokenizer_p.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + padding=True, + ) + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes, padding="longest") + input_p = tokenizer_p.batch_encode_plus(words, boxes=boxes, padding=True) + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + # Batch_encode_plus - Pair input + questions, words, boxes = self.get_question_words_and_boxes_batch() + + input_r = tokenizer_r.batch_encode_plus( + list(zip(questions, words)), + is_pair=True, + boxes=boxes, + max_length=max_length, + truncation=True, + padding="max_length", + ) + input_p = tokenizer_p.batch_encode_plus( + list(zip(questions, words)), + is_pair=True, + boxes=boxes, + max_length=max_length, + truncation=True, + padding="max_length", + ) + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + list(zip(questions, words)), + is_pair=True, + boxes=boxes, + padding=True, + ) + input_p = tokenizer_p.batch_encode_plus( + list(zip(questions, words)), + is_pair=True, + boxes=boxes, + padding="longest", + ) + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + # Using pad on single examples after tokenization + words, boxes = self.get_words_and_boxes() + input_r = tokenizer_r.encode_plus(words, boxes=boxes) + input_r = tokenizer_r.pad(input_r) + + input_p = tokenizer_r.encode_plus(words, boxes=boxes) + input_p = tokenizer_r.pad(input_p) + + self.assert_padded_input_match( + input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id + ) + + # Using pad on single examples after tokenization + input_r = tokenizer_r.encode_plus(words, boxes=boxes) + input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") + + input_p = tokenizer_r.encode_plus(words, boxes=boxes) + input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length") + + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + + # Using pad after tokenization + words, boxes = self.get_words_and_boxes_batch() + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + ) + input_r = tokenizer_r.pad(input_r) + + input_p = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + ) + input_p = tokenizer_r.pad(input_p) + + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + # Using pad after tokenization + words, boxes = self.get_words_and_boxes_batch() + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + ) + input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") + + input_p = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + ) + input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length") + + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + def test_padding_warning_message_fast_tokenizer(self): + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + words, boxes = 
self.get_words_and_boxes_batch() + + tokenizer_fast = self.get_rust_tokenizer() + + encoding_fast = tokenizer_fast( + words, + boxes=boxes, + ) + + with self.assertLogs("transformers", level="WARNING") as cm: + tokenizer_fast.pad(encoding_fast) + self.assertEqual(len(cm.records), 1) + self.assertIn( + "Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to" + " encode the text followed by a call to the `pad` method to get a padded encoding.", + cm.records[0].message, + ) + + if not self.test_slow_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + tokenizer_slow = self.get_tokenizer() + + encoding_slow = tokenizer_slow( + words, + boxes=boxes, + ) + + with self.assertLogs(level="WARNING") as cm: + # We want to assert there are no warnings, but the 'assertLogs' method does not support that. + # Therefore, we are adding a dummy warning, and then we will assert it is the only warning. + logger.warning("Dummy warning") + tokenizer_slow.pad(encoding_slow) + self.assertEqual(len(cm.records), 1) + self.assertIn( + "Dummy warning", + cm.records[0].message, + ) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Test not batched + words, boxes = self.get_words_and_boxes() + encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes) + encoded_sequences_2 = tokenizer(words, boxes=boxes) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + # Test not batched pairs + question, words, boxes = self.get_question_words_and_boxes() + encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes) + encoded_sequences_2 = tokenizer(words, boxes=boxes) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + # Test batched + words, boxes = self.get_words_and_boxes_batch() + encoded_sequences_1 = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes) + encoded_sequences_2 = tokenizer(words, boxes=boxes) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + def test_batch_encode_plus_batch_sequence_length(self): + # Tests that all encoded values have the correct size + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes_batch() + + encoded_sequences = [ + tokenizer.encode_plus(words_example, boxes=boxes_example) + for words_example, boxes_example in zip(words, boxes) + ] + encoded_sequences_batch = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes, padding=False) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + maximum_length = len( + max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len) + ) + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + encoded_sequences_padded = [ + tokenizer.encode_plus( + words_example, boxes=boxes_example, max_length=maximum_length, padding="max_length" + ) + for words_example, boxes_example in zip(words, boxes) + ] + + encoded_sequences_batch_padded = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, padding=True + ) + self.assertListEqual( + encoded_sequences_padded, + 
self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded), + ) + + # check 'longest' is unsensitive to a max length + encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, padding=True + ) + encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding="longest" + ) + for key in encoded_sequences_batch_padded_1.keys(): + self.assertListEqual( + encoded_sequences_batch_padded_1[key], + encoded_sequences_batch_padded_2[key], + ) + + # check 'no_padding' is unsensitive to a max length + encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, padding=False + ) + encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding=False + ) + for key in encoded_sequences_batch_padded_1.keys(): + self.assertListEqual( + encoded_sequences_batch_padded_1[key], + encoded_sequences_batch_padded_2[key], + ) + + @unittest.skip(reason="batch_encode_plus does not handle overflowing tokens.") + def test_batch_encode_plus_overflowing_tokens(self): + pass + + def test_batch_encode_plus_padding(self): + # Test that padded sequences are equivalent between batch_encode_plus and encode_plus + + # Right padding tests + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes_batch() + + max_length = 100 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + encoded_sequences = [ + tokenizer.encode_plus( + words_example, boxes=boxes_example, max_length=max_length, padding="max_length" + ) + for words_example, boxes_example in zip(words, boxes) + ] + encoded_sequences_batch = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length" + ) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + # Left padding tests + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + tokenizer.padding_side = "left" + words, boxes = self.get_words_and_boxes_batch() + + max_length = 100 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + encoded_sequences = [ + tokenizer.encode_plus( + words_example, boxes=boxes_example, max_length=max_length, padding="max_length" + ) + for words_example, boxes_example in zip(words, boxes) + ] + encoded_sequences_batch = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length" + ) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + def test_padding_to_multiple_of(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + if tokenizer.pad_token is None: + self.skipTest(reason="No padding token.") + else: + words, boxes = self.get_words_and_boxes() + + # empty_tokens = tokenizer([""], [[]], padding=True, pad_to_multiple_of=8) + normal_tokens = tokenizer(words, boxes=boxes, padding=True, pad_to_multiple_of=8) + # for key, value in 
empty_tokens.items(): + # self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + normal_tokens = tokenizer(words, boxes=boxes, pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + # Should also work with truncation + normal_tokens = tokenizer(words, boxes=boxes, padding=True, truncation=True, pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + # truncation to something which is not a multiple of pad_to_multiple_of raises an error + self.assertRaises( + ValueError, + tokenizer.__call__, + words, + boxes=boxes, + padding=True, + truncation=True, + max_length=12, + pad_to_multiple_of=8, + ) + + def test_tokenizer_slow_store_full_signature(self): + signature = inspect.signature(self.tokenizer_class.__init__) + tokenizer = self.get_tokenizer() + + for parameter_name, parameter in signature.parameters.items(): + if parameter.default != inspect.Parameter.empty: + self.assertIn(parameter_name, tokenizer.init_kwargs) + + def test_build_inputs_with_special_tokens(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + self.skipTest(reason="test_rust_tokenizer is set to False") + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + + # Input tokens id + words, boxes = self.get_words_and_boxes() + input_simple = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False) + input_pair = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False) + + # Generate output + output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple) + output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple) + self.assertEqual(output_p, output_r) + + # Generate pair output + output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair) + output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair) + self.assertEqual(output_p, output_r) + + def test_special_tokens_mask_input_pairs(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + words, + boxes=boxes, + add_special_tokens=True, + return_special_tokens_mask=True, + # add_prefix_space=False, + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + + filtered_sequence = [ + (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) + ] + filtered_sequence = [x for x in filtered_sequence if x is not None] + self.assertEqual(encoded_sequence, filtered_sequence) + + def test_special_tokens_mask(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in 
tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + # Testing single inputs + encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + words, boxes=boxes, add_special_tokens=True, return_special_tokens_mask=True + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + + filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]] + self.assertEqual(encoded_sequence, filtered_sequence) + + def test_save_and_load_tokenizer(self): + # safety check on max_len default value so we are sure the test works + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + self.assertNotEqual(tokenizer.model_max_length, 42) + + # Now let's start the test + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + words, boxes = self.get_words_and_boxes() + tmpdirname = tempfile.mkdtemp() + + before_tokens = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + before_vocab = tokenizer.get_vocab() + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + after_vocab = after_tokenizer.get_vocab() + self.assertListEqual(before_tokens, after_tokens) + self.assertDictEqual(before_vocab, after_vocab) + + shutil.rmtree(tmpdirname) + + def test_right_and_left_padding(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + sequence = "Sequence" + padding_size = 10 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, sequence) + + padding_idx = tokenizer.pad_token_id + + # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" + encoded_sequence = tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode( + words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length" + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + + # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "left" + encoded_sequence = tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode( + words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length" + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert [padding_idx] * padding_size + encoded_sequence == padded_sequence + + # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding' + encoded_sequence = 
tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(words, boxes=boxes, padding=True) + padded_sequence_right_length = len(padded_sequence_right) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + tokenizer.padding_side = "left" + padded_sequence_left = tokenizer.encode(words, boxes=boxes, padding="longest") + padded_sequence_left_length = len(padded_sequence_left) + assert sequence_length == padded_sequence_left_length + assert encoded_sequence == padded_sequence_left + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(words, boxes=boxes) + padded_sequence_right_length = len(padded_sequence_right) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + tokenizer.padding_side = "left" + padded_sequence_left = tokenizer.encode(words, boxes=boxes, padding=False) + padded_sequence_left_length = len(padded_sequence_left) + assert sequence_length == padded_sequence_left_length + assert encoded_sequence == padded_sequence_left + + def test_token_type_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # test 1: single sequence + words, boxes = self.get_words_and_boxes() + + output = tokenizer(words, boxes=boxes, return_token_type_ids=True) + + # Assert that the token type IDs have the same length as the input IDs + self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"])) + + # Assert that the token type IDs have the same length as the attention mask + self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"])) + + self.assertIn(0, output["token_type_ids"]) + self.assertNotIn(1, output["token_type_ids"]) + + # test 2: two sequences (question + words) + question, words, boxes = self.get_question_words_and_boxes() + + output = tokenizer(question, words, boxes, return_token_type_ids=True) + + # Assert that the token type IDs have the same length as the input IDs + self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"])) + + # Assert that the token type IDs have the same length as the attention mask + self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"])) + + self.assertIn(0, output["token_type_ids"]) + + def test_offsets_mapping(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + text = ["a", "wonderful", "test"] + boxes = [[1, 8, 12, 20] for _ in range(len(text))] + + # No pair + tokens_with_offsets = tokenizer_r.encode_plus( + text, + boxes=boxes, + return_special_tokens_mask=True, + return_offsets_mapping=True, + add_special_tokens=True, + ) + added_tokens = tokenizer_r.num_special_tokens_to_add(False) + offsets = tokens_with_offsets["offset_mapping"] + + # Assert there is the same number of tokens and offsets + self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) + + # Assert there is online added_tokens special_tokens + self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) + + # Pairs + text = "what's his name" + pair = ["a", "wonderful", "test"] + boxes = [[1, 8, 12, 20] for _ in range(len(pair))] + tokens_with_offsets = tokenizer_r.encode_plus( + text, + pair, + 
boxes=boxes, + return_special_tokens_mask=True, + return_offsets_mapping=True, + add_special_tokens=True, + ) + added_tokens = tokenizer_r.num_special_tokens_to_add(True) + offsets = tokens_with_offsets["offset_mapping"] + + # Assert there is the same number of tokens and offsets + self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) + + # Assert there is online added_tokens special_tokens + self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) + + @require_torch + @slow + def test_torch_encode_plus_sent_to_model(self): + import torch + + from transformers import MODEL_MAPPING, TOKENIZER_MAPPING + + MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING) + + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: + self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING") + + config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] + config = config_class() + + if config.is_encoder_decoder or config.pad_token_id is None: + self.skipTest(reason="Model is an encoder-decoder or has no pad token id set.") + + model = model_class(config) + + # Make sure the model contains at least the full vocabulary size in its embedding matrix + is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight") + assert ( + (model.get_input_embeddings().weight.shape[0] >= len(tokenizer)) + if is_using_common_embeddings + else True + ) + + # Build sequence + words, boxes = self.get_words_and_boxes() + encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt") + batch_encoded_sequence = tokenizer.batch_encode_plus( + [words, words], boxes=[boxes, boxes], return_tensors="pt" + ) + + # We add dummy pixel_values keys (as LayoutLMv3 actually also requires a feature extractor + # to prepare the image input) + encoded_sequence["pixel_values"] = torch.randn(1, 3, 224, 224) + batch_encoded_sequence["pixel_values"] = torch.randn(2, 3, 224, 224) + + # This should not fail + with torch.no_grad(): # saves some time + model(**encoded_sequence) + model(**batch_encoded_sequence) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + self.skipTest(reason="test_slow_tokenizer is set to False") + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + words, boxes = self.get_words_and_boxes() + + ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=True) + rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=True) + self.assertListEqual(ids, rust_ids) + + def test_tokenization_python_rust_equals(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + self.skipTest(reason="test_slow_tokenizer is set to False") + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = 
self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + + words, boxes = self.get_words_and_boxes() + + # Ensure basic input match + input_p = tokenizer_p.encode_plus(words, boxes=boxes) + input_r = tokenizer_r.encode_plus(words, boxes=boxes) + + for key in filter( + lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() + ): + self.assertSequenceEqual(input_p[key], input_r[key]) + + input_pairs_p = tokenizer_p.encode_plus(words, boxes=boxes) + input_pairs_r = tokenizer_r.encode_plus(words, boxes=boxes) + + for key in filter( + lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() + ): + self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key]) + + words = ["hello" for _ in range(1000)] + boxes = [[1000, 1000, 1000, 1000] for _ in range(1000)] + + # Ensure truncation match + input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=512, truncation=True) + input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=512, truncation=True) + + for key in filter( + lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() + ): + self.assertSequenceEqual(input_p[key], input_r[key]) + + # Ensure truncation with stride match + input_p = tokenizer_p.encode_plus( + words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True + ) + input_r = tokenizer_r.encode_plus( + words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True + ) + + for key in filter( + lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() + ): + self.assertSequenceEqual(input_p[key], input_r[key][0]) + + def test_embeded_special_tokens(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + self.skipTest(reason="test_slow_tokenizer is set to False") + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + words, boxes = self.get_words_and_boxes() + tokens_r = tokenizer_r.encode_plus( + words, + boxes=boxes, + add_special_tokens=True, + ) + tokens_p = tokenizer_p.encode_plus( + words, + boxes=boxes, + add_special_tokens=True, + ) + + for key in tokens_p.keys(): + self.assertEqual(tokens_r[key], tokens_p[key]) + + if "token_type_ids" in tokens_r: + self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) + + tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) + tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) + self.assertSequenceEqual(tokens_r, tokens_p) + + def test_compare_add_special_tokens(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False) + + words, boxes = self.get_words_and_boxes() + # tokenize() + no_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=False) + with_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=True) + 
self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add) + + # encode() + no_special_tokens = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=False) + with_special_tokens = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=True) + self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add) + + # encode_plus() + no_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=False) + with_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=True) + for key in no_special_tokens.keys(): + self.assertEqual( + len(no_special_tokens[key]), + len(with_special_tokens[key]) - simple_num_special_tokens_to_add, + ) + + # # batch_encode_plus + words, boxes = self.get_words_and_boxes_batch() + + no_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=False) + with_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=True) + for key in no_special_tokens.keys(): + for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]): + self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add) + + @slow + def test_layoutlmv3_truncation_integration_test(self): + words, boxes = self.get_words_and_boxes() + + tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", model_max_length=512) + + for i in range(12, 512): + new_encoded_inputs = tokenizer.encode(words, boxes=boxes, max_length=i, truncation=True) + + # Ensure that the input IDs are less than the max length defined. + self.assertLessEqual(len(new_encoded_inputs), i) + + tokenizer.model_max_length = 20 + new_encoded_inputs = tokenizer.encode(words, boxes=boxes, truncation=True) + dropped_encoded_inputs = tokenizer.encode(words, boxes=boxes, truncation=True) + + # Ensure that the input IDs are still truncated when no max_length is specified + self.assertListEqual(new_encoded_inputs, dropped_encoded_inputs) + self.assertLessEqual(len(new_encoded_inputs), 20) + + def test_sequence_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + if not tokenizer.is_fast: + continue + with self.subTest(f"{tokenizer.__class__.__name__}"): + seq_0 = "Test this method." 
+ seq_1 = ["With", "these", "inputs."] + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(seq_1))] + + # We want to have sequence 0 and sequence 1 are tagged + # respectively with 0 and 1 token_ids + # (regardless of whether the model use token type ids) + # We use this assumption in the QA pipeline among other place + output = tokenizer(seq_0.split(), boxes=boxes) + self.assertIn(0, output.sequence_ids()) + + output = tokenizer(seq_0, seq_1, boxes=boxes) + self.assertIn(0, output.sequence_ids()) + self.assertIn(1, output.sequence_ids()) + + if tokenizer.num_special_tokens_to_add(pair=True): + self.assertIn(None, output.sequence_ids()) + + def test_special_tokens_initialization(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + added_tokens = [AddedToken("", lstrip=True)] + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + words = "Hey this is a token".split() + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + r_output = tokenizer_r.encode(words, boxes=boxes) + + special_token_id = tokenizer_r.encode( + [""], boxes=[1000, 1000, 1000, 1000], add_special_tokens=False + )[0] + + self.assertTrue(special_token_id in r_output) + + if self.test_slow_tokenizer: + tokenizer_cr = self.rust_tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True + ) + tokenizer_p = self.tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + + words = "Hey this is a token".split() + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + + p_output = tokenizer_p.encode(words, boxes=boxes) + cr_output = tokenizer_cr.encode(words, boxes=boxes) + + self.assertEqual(p_output, r_output) + self.assertEqual(cr_output, r_output) + self.assertTrue(special_token_id in p_output) + self.assertTrue(special_token_id in cr_output) + + def test_training_new_tokenizer(self): + # This feature only exists for fast tokenizers + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + tokenizer = self.get_rust_tokenizer() + new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) + + # Test we can use the new tokenizer with something not seen during training + text = [["this", "is", "the"], ["how", "are", "you"]] + boxes = [[[1, 2, 3, 4], [5, 6, 7, 8], [1, 3, 4, 8]], [[5, 6, 7, 8], [4, 5, 6, 7], [3, 9, 2, 7]]] + inputs = new_tokenizer(text, boxes=boxes) + self.assertEqual(len(inputs["input_ids"]), 2) + decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) + expected_result = " this is the" + + if tokenizer.backend_tokenizer.normalizer is not None: + expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result) + self.assertEqual(expected_result, decoded_input) + + # We check that the parameters of the tokenizer remained the same + # Check we have the same number of added_tokens for both pair and non-pair inputs. + self.assertEqual(tokenizer.num_special_tokens_to_add(False), new_tokenizer.num_special_tokens_to_add(False)) + self.assertEqual(tokenizer.num_special_tokens_to_add(True), new_tokenizer.num_special_tokens_to_add(True)) + + # Check we have the correct max_length for both pair and non-pair inputs. 
+ self.assertEqual(tokenizer.max_len_single_sentence, new_tokenizer.max_len_single_sentence) + self.assertEqual(tokenizer.max_len_sentences_pair, new_tokenizer.max_len_sentences_pair) + + # Assert the set of special tokens match as we didn't ask to change them + self.assertSequenceEqual( + tokenizer.all_special_tokens_extended, + new_tokenizer.all_special_tokens_extended, + ) + + self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map) + + def test_training_new_tokenizer_with_special_tokens_change(self): + # This feature only exists for fast tokenizers + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + tokenizer = self.get_rust_tokenizer() + # Test with a special tokens map + class_signature = inspect.signature(tokenizer.__class__) + if "cls_token" in class_signature.parameters: + new_tokenizer = tokenizer.train_new_from_iterator( + SMALL_TRAINING_CORPUS, 100, special_tokens_map={tokenizer.cls_token: "<cls>"} + ) + cls_id = new_tokenizer.get_vocab()["<cls>"] + self.assertEqual(new_tokenizer.cls_token, "<cls>") + self.assertEqual(new_tokenizer.cls_token_id, cls_id) + + # Create a new mapping from the special tokens defined in the original tokenizer + special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy() + special_tokens_list.remove("additional_special_tokens") + special_tokens_map = {} + for token in special_tokens_list: + # Get the private one to avoid unnecessary warnings. + if getattr(tokenizer, token) is not None: + special_token = getattr(tokenizer, token) + special_tokens_map[special_token] = f"{special_token}a" + + # Train new tokenizer + new_tokenizer = tokenizer.train_new_from_iterator( + SMALL_TRAINING_CORPUS, 100, special_tokens_map=special_tokens_map + ) + + # Check the changes + for token in special_tokens_list: + # Get the private one to avoid unnecessary warnings. + if getattr(tokenizer, token) is None: + continue + special_token = getattr(tokenizer, token) + if special_token in special_tokens_map: + new_special_token = getattr(new_tokenizer, token) + self.assertEqual(special_tokens_map[special_token], new_special_token) + + new_id = new_tokenizer.get_vocab()[new_special_token] + self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id) + + # Check if the AddedToken / string format has been kept + for special_token in tokenizer.all_special_tokens_extended: + if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map: + # The special token must appear identically in the list of the new tokenizer. + self.assertTrue( + special_token in new_tokenizer.all_special_tokens_extended, + f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}", + ) + elif isinstance(special_token, AddedToken): + # The special token must appear in the list of the new tokenizer as an object of type AddedToken with + # the same parameters as the old AddedToken except the content that the user has requested to change.
+ special_token_str = special_token.content + new_special_token_str = special_tokens_map[special_token_str] + + find = False + for candidate in new_tokenizer.all_special_tokens_extended: + if ( + isinstance(candidate, AddedToken) + and candidate.content == new_special_token_str + and candidate.lstrip == special_token.lstrip + and candidate.rstrip == special_token.rstrip + and candidate.normalized == special_token.normalized + and candidate.single_word == special_token.single_word + ): + find = True + break + self.assertTrue( + find, + f"'{new_special_token_str}' doesn't appear in the list " + f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as " + f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}", + ) + elif special_token not in special_tokens_map: + # The special token must appear identically in the list of the new tokenizer. + self.assertTrue( + special_token in new_tokenizer.all_special_tokens_extended, + f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}", + ) + + else: + # The special token must appear in the list of the new tokenizer as an object of type string. + self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended) + + # Test we can use the new tokenizer with something not seen during training + words = [["this", "is"], ["hello", "🤗"]] + boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[1, 2, 3, 4], [5, 6, 7, 8]]] + inputs = new_tokenizer(words, boxes=boxes) + self.assertEqual(len(inputs["input_ids"]), 2) + decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) + expected_result = " this is" + + if tokenizer.backend_tokenizer.normalizer is not None: + expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result) + self.assertEqual(expected_result, decoded_input) + + def test_prepare_for_model(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + # only test prepare_for_model for the slow tokenizer + if tokenizer.__class__.__name__ == "LayoutLMv3TokenizerFast": + continue + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + prepared_input_dict = tokenizer.prepare_for_model(words, boxes=boxes, add_special_tokens=True) + + input_dict = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True) + + self.assertEqual(input_dict, prepared_input_dict) + + def test_padding_different_model_input_name(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + self.skipTest(reason="test_slow_tokenizer is set to False") + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) + pad_token_id = tokenizer_p.pad_token_id + + words, boxes = self.get_words_and_boxes_batch() + + input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes) + input_p = tokenizer_r.batch_encode_plus(words, boxes=boxes) + + # rename encoded batch to "inputs" + input_r["inputs"] = input_r[tokenizer_r.model_input_names[0]] + del input_r[tokenizer_r.model_input_names[0]] + + input_p["inputs"] = input_p[tokenizer_p.model_input_names[0]] + del 
input_p[tokenizer_p.model_input_names[0]] + + # Renaming `input_ids` to `inputs` + tokenizer_r.model_input_names = ["inputs"] + tokenizer_r.model_input_names[1:] + tokenizer_p.model_input_names = ["inputs"] + tokenizer_p.model_input_names[1:] + + input_r = tokenizer_r.pad(input_r, padding="longest") + input_p = tokenizer_r.pad(input_p, padding="longest") + + max_length = len(input_p["inputs"][0]) + self.assert_batch_padded_input_match( + input_r, input_p, max_length, pad_token_id, model_main_input_name="inputs" + ) + + def test_batch_encode_dynamic_overflowing(self): + """ + When calling batch_encode with multiple sequences, it can return different number of + overflowing encoding for each sequence: + [ + Sequence 1: [Encoding 1, Encoding 2], + Sequence 2: [Encoding 1], + Sequence 3: [Encoding 1, Encoding 2, ... Encoding N] + ] + This needs to be padded so that it can represented as a tensor + """ + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"): + if is_torch_available(): + returned_tensor = "pt" + elif is_tf_available(): + returned_tensor = "tf" + else: + returned_tensor = "jax" + + # Single example + words = ["HuggingFace", "is", "solving", "NLP", "one", "commit", "at", "a", "time"] + boxes = [[i, i, i, i] for i in range(len(words))] + tokens = tokenizer.encode_plus( + words, + boxes=boxes, + max_length=6, + padding=True, + truncation=True, + return_tensors=returned_tensor, + return_overflowing_tokens=True, + ) + + for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): + if key != "bbox": + self.assertEqual(len(tokens[key].shape), 2) + else: + self.assertEqual(len(tokens[key].shape), 3) + + # Batch of examples + # For these 2 examples, 3 training examples will be created + words_batched = [ + ["HuggingFace", "is", "solving", "NLP", "one", "commit", "at", "a", "time"], + ["Very", "tiny", "input"], + ] + boxes_batched = [[[i, i, i, i] for i in range(len(words_item))] for words_item in words_batched] + tokens = tokenizer.batch_encode_plus( + words_batched, + boxes=boxes_batched, + max_length=6, + padding=True, + truncation="only_first", + return_tensors=returned_tensor, + return_overflowing_tokens=True, + ) + + for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): + if key != "bbox": + self.assertEqual(len(tokens[key].shape), 2) + self.assertEqual(tokens[key].shape[-1], 6) + else: + self.assertEqual(len(tokens[key].shape), 3) + self.assertEqual(tokens[key].shape[-1], 4) + + @unittest.skip(reason="TO DO: overwrite this very extensive test.") + def test_alignement_methods(self): + pass + + def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5): + toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))] + toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks)) + toks = list( + filter( + lambda t: [t[0]] + == tokenizer.encode(t[1].split(" "), boxes=len(t[1]) * [[1, 1, 1, 1]], add_special_tokens=False), + toks, + ) + ) + if max_length is not None and len(toks) > max_length: + toks = toks[:max_length] + if min_length is not None and len(toks) < min_length and len(toks) > 0: + while len(toks) < min_length: + toks = toks + toks + # toks_str = [t[1] for t in toks] + toks_ids = [t[0] for t in toks] + + # Ensure consistency + output_txt = 
tokenizer.decode(toks_ids, clean_up_tokenization_spaces=False) + if " " not in output_txt and len(toks_ids) > 1: + output_txt = ( + tokenizer.decode([toks_ids[0]], clean_up_tokenization_spaces=False) + + " " + + tokenizer.decode(toks_ids[1:], clean_up_tokenization_spaces=False) + ) + if with_prefix_space: + output_txt = " " + output_txt + words = output_txt.split(" ") + boxes = [[i, i, i, i] for i in range(len(words))] + output_ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + + return words, boxes, output_ids + + def test_added_token_with_space_before(self): + tokenizer_s = self.get_tokenizer() + tokenizer_f = self.get_rust_tokenizer() + + tokens_to_add = ["AAA", "bbb"] + + words_with_space = [f" {token}" for token in tokens_to_add + list(tokenizer_s.added_tokens_encoder.keys())] + words_without_space = tokens_to_add + list(tokenizer_s.added_tokens_encoder.keys()) + boxes = [[i, i, i, i] for i in range(len(words_with_space))] + + tokens_to_add_formated = [ + AddedToken(token, rstrip=True, lstrip=True, single_word=False) for token in tokens_to_add + ] + tokenizer_s.add_tokens(tokens_to_add_formated) + tokenizer_f.add_tokens(tokens_to_add_formated) + + ids_s = tokenizer_s(words_with_space, boxes=boxes).input_ids + ids_f = tokenizer_f(words_with_space, boxes=boxes).input_ids + + tokens_s = tokenizer_s.convert_ids_to_tokens(ids_s) + tokens_f = tokenizer_f.convert_ids_to_tokens(ids_f) + + ids_s = tokenizer_s(words_without_space, boxes=boxes).input_ids + ids_f = tokenizer_f(words_without_space, boxes=boxes).input_ids + + tokens_s = tokenizer_s.convert_ids_to_tokens(ids_s) + tokens_f = tokenizer_f.convert_ids_to_tokens(ids_f) + + self.assertEqual(tokens_s, tokens_f) + + def test_maximum_encoding_length_pair_input(self): + tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Build a sequence from our model's vocabulary + stride = 2 + seq_0, boxes_0, ids = self.get_clean_sequence(tokenizer, max_length=20) + question_0 = " ".join(map(str, seq_0)) + if len(ids) <= 2 + stride: + seq_0 = (seq_0 + " ") * (2 + stride) + ids = None + + seq0_tokens = tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False) + seq0_input_ids = seq0_tokens["input_ids"] + + self.assertGreater(len(seq0_input_ids), 2 + stride) + question_1 = "This is another sentence to be encoded." 
+ seq_1 = ["what", "a", "weird", "test", "weirdly", "weird"] + boxes_1 = [[i, i, i, i] for i in range(1, len(seq_1) + 1)] + seq1_tokens = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False) + if abs(len(seq0_input_ids) - len(seq1_tokens["input_ids"])) <= 2: + seq1_tokens_input_ids = seq1_tokens["input_ids"] + seq1_tokens["input_ids"] + seq_1 = tokenizer.decode(seq1_tokens_input_ids, clean_up_tokenization_spaces=False) + seq_1 = seq_1.split(" ") + boxes_1 = [[i, i, i, i] for i in range(1, len(seq_1) + 1)] + seq1_tokens = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False) + seq1_input_ids = seq1_tokens["input_ids"] + + self.assertGreater(len(seq1_input_ids), 2 + stride) + + smallest = seq1_input_ids if len(seq0_input_ids) > len(seq1_input_ids) else seq0_input_ids + + # We are not using the special tokens - a bit too hard to test all the tokenizers with this + # TODO try this again later + sequence = tokenizer( + question_0, seq_1, boxes=boxes_1, add_special_tokens=False + ) # , add_prefix_space=False) + + # Test with max model input length + model_max_length = tokenizer.model_max_length + self.assertEqual(model_max_length, 100) + seq_2 = seq_0 * model_max_length + question_2 = " ".join(map(str, seq_2)) + boxes_2 = boxes_0 * model_max_length + self.assertGreater(len(seq_2), model_max_length) + + sequence1 = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False) + total_length1 = len(sequence1["input_ids"]) + sequence2 = tokenizer(question_2, seq_1, boxes=boxes_1, add_special_tokens=False) + total_length2 = len(sequence2["input_ids"]) + self.assertLess(total_length1, model_max_length, "Issue with the testing sequence, please update it.") + self.assertGreater( + total_length2, model_max_length, "Issue with the testing sequence, please update it." 
+ ) + + # Simple + padding_strategies = ( + [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False] + ) + for padding_state in padding_strategies: + with self.subTest(f"{tokenizer.__class__.__name__} Padding: {padding_state}"): + for truncation_state in [True, "longest_first", "only_first"]: + with self.subTest(f"{tokenizer.__class__.__name__} Truncation: {truncation_state}"): + output = tokenizer( + question_2, + seq_1, + boxes=boxes_1, + padding=padding_state, + truncation=truncation_state, + ) + self.assertEqual(len(output["input_ids"]), model_max_length) + self.assertEqual(len(output["bbox"]), model_max_length) + + output = tokenizer( + [question_2], + [seq_1], + boxes=[boxes_1], + padding=padding_state, + truncation=truncation_state, + ) + self.assertEqual(len(output["input_ids"][0]), model_max_length) + self.assertEqual(len(output["bbox"][0]), model_max_length) + + # Simple + output = tokenizer( + question_1, seq_2, boxes=boxes_2, padding=padding_state, truncation="only_second" + ) + self.assertEqual(len(output["input_ids"]), model_max_length) + self.assertEqual(len(output["bbox"]), model_max_length) + + output = tokenizer( + [question_1], [seq_2], boxes=[boxes_2], padding=padding_state, truncation="only_second" + ) + self.assertEqual(len(output["input_ids"][0]), model_max_length) + self.assertEqual(len(output["bbox"][0]), model_max_length) + + # Simple with no truncation + # Reset warnings + tokenizer.deprecation_warnings = {} + with self.assertLogs("transformers", level="WARNING") as cm: + output = tokenizer( + question_1, seq_2, boxes=boxes_2, padding=padding_state, truncation=False + ) + self.assertNotEqual(len(output["input_ids"]), model_max_length) + self.assertNotEqual(len(output["bbox"]), model_max_length) + self.assertEqual(len(cm.records), 1) + self.assertTrue( + cm.records[0].message.startswith( + "Token indices sequence length is longer than the specified maximum sequence length" + " for this model" + ) + ) + + tokenizer.deprecation_warnings = {} + with self.assertLogs("transformers", level="WARNING") as cm: + output = tokenizer( + [question_1], [seq_2], boxes=[boxes_2], padding=padding_state, truncation=False + ) + self.assertNotEqual(len(output["input_ids"][0]), model_max_length) + self.assertNotEqual(len(output["bbox"][0]), model_max_length) + self.assertEqual(len(cm.records), 1) + self.assertTrue( + cm.records[0].message.startswith( + "Token indices sequence length is longer than the specified maximum sequence length" + " for this model" + ) + ) + # Check the order of Sequence of input ids, overflowing tokens and bbox sequence with truncation + truncated_first_sequence = ( + tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"][:-2] + + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"] + ) + truncated_second_sequence = ( + tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"] + + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"][:-2] + ) + truncated_longest_sequence = ( + truncated_first_sequence + if len(seq0_input_ids) > len(seq1_input_ids) + else truncated_second_sequence + ) + + overflow_first_sequence = ( + tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"][-(2 + stride) :] + + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"] + ) + overflow_second_sequence = ( + tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False)["input_ids"] + + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["input_ids"][-(2 + 
stride) :] + ) + overflow_longest_sequence = ( + overflow_first_sequence if len(seq0_input_ids) > len(seq1_input_ids) else overflow_second_sequence + ) + + bbox_first = [[0, 0, 0, 0]] * (len(seq0_input_ids) - 2) + bbox_first_sequence = bbox_first + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["bbox"] + overflowing_token_bbox_first_sequence_slow = [[0, 0, 0, 0]] * (2 + stride) + overflowing_token_bbox_first_sequence_fast = [[0, 0, 0, 0]] * (2 + stride) + tokenizer( + seq_1, boxes=boxes_1, add_special_tokens=False + )["bbox"] + + bbox_second = [[0, 0, 0, 0]] * len(seq0_input_ids) + bbox_second_sequence = ( + bbox_second + tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False)["bbox"][:-2] + ) + overflowing_token_bbox_second_sequence_slow = tokenizer( + seq_1, boxes=boxes_1, add_special_tokens=False + )["bbox"][-(2 + stride) :] + overflowing_token_bbox_second_sequence_fast = [[0, 0, 0, 0]] * len(seq0_input_ids) + tokenizer( + seq_1, boxes=boxes_1, add_special_tokens=False + )["bbox"][-(2 + stride) :] + + bbox_longest_sequence = ( + bbox_first_sequence if len(seq0_tokens) > len(seq1_tokens) else bbox_second_sequence + ) + overflowing_token_bbox_longest_sequence_fast = ( + overflowing_token_bbox_first_sequence_fast + if len(seq0_tokens) > len(seq1_tokens) + else overflowing_token_bbox_second_sequence_fast + ) + + # Overflowing tokens are handled quite differently in slow and fast tokenizers + if isinstance(tokenizer, LayoutLMv3TokenizerFast): + information = tokenizer( + question_0, + seq_1, + boxes=boxes_1, + max_length=len(sequence["input_ids"]) - 2, + add_special_tokens=False, + stride=stride, + truncation="longest_first", + return_overflowing_tokens=True, + # add_prefix_space=False, + ) + truncated_sequence = information["input_ids"][0] + overflowing_tokens = information["input_ids"][1] + bbox = information["bbox"][0] + overflowing_bbox = information["bbox"][1] + self.assertEqual(len(information["input_ids"]), 2) + + self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2) + self.assertEqual(truncated_sequence, truncated_longest_sequence) + + self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest)) + self.assertEqual(overflowing_tokens, overflow_longest_sequence) + self.assertEqual(bbox, bbox_longest_sequence) + + self.assertEqual(len(overflowing_bbox), 2 + stride + len(smallest)) + self.assertEqual(overflowing_bbox, overflowing_token_bbox_longest_sequence_fast) + else: + # No overflowing tokens when using 'longest' in python tokenizers + with self.assertRaises(ValueError) as context: + information = tokenizer( + question_0, + seq_1, + boxes=boxes_1, + max_length=len(sequence["input_ids"]) - 2, + add_special_tokens=False, + stride=stride, + truncation="longest_first", + return_overflowing_tokens=True, + # add_prefix_space=False, + ) + + self.assertTrue( + context.exception.args[0].startswith( + "Not possible to return overflowing tokens for pair of sequences with the " + "`longest_first`. Please select another truncation strategy than `longest_first`, " + "for instance `only_second` or `only_first`." 
+ ) + ) + + # Overflowing tokens are handled quite differently in slow and fast tokenizers + if isinstance(tokenizer, LayoutLMv3TokenizerFast): + information = tokenizer( + question_0, + seq_1, + boxes=boxes_1, + max_length=len(sequence["input_ids"]) - 2, + add_special_tokens=False, + stride=stride, + truncation=True, + return_overflowing_tokens=True, + # add_prefix_space=False, + ) + truncated_sequence = information["input_ids"][0] + overflowing_tokens = information["input_ids"][1] + bbox = information["bbox"][0] + overflowing_bbox = information["bbox"][1] + self.assertEqual(len(information["input_ids"]), 2) + + self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2) + self.assertEqual(truncated_sequence, truncated_longest_sequence) + + self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest)) + self.assertEqual(overflowing_tokens, overflow_longest_sequence) + self.assertEqual(bbox, bbox_longest_sequence) + self.assertEqual(overflowing_bbox, overflowing_token_bbox_longest_sequence_fast) + else: + # No overflowing tokens when using 'longest' in python tokenizers + with self.assertRaises(ValueError) as context: + information = tokenizer( + question_0, + seq_1, + boxes=boxes_1, + max_length=len(sequence["input_ids"]) - 2, + add_special_tokens=False, + stride=stride, + truncation=True, + return_overflowing_tokens=True, + # add_prefix_space=False, + ) + + self.assertTrue( + context.exception.args[0].startswith( + "Not possible to return overflowing tokens for pair of sequences with the " + "`longest_first`. Please select another truncation strategy than `longest_first`, " + "for instance `only_second` or `only_first`." + ) + ) + + information_first_truncated = tokenizer( + question_0, + seq_1, + boxes=boxes_1, + max_length=len(sequence["input_ids"]) - 2, + add_special_tokens=False, + stride=stride, + truncation="only_first", + return_overflowing_tokens=True, + # add_prefix_space=False, + ) + # Overflowing tokens are handled quite differently in slow and fast tokenizers + if isinstance(tokenizer, LayoutLMv3TokenizerFast): + truncated_sequence = information_first_truncated["input_ids"][0] + overflowing_tokens = information_first_truncated["input_ids"][1] + bbox = information_first_truncated["bbox"][0] + overflowing_bbox = information_first_truncated["bbox"][0] + self.assertEqual(len(information_first_truncated["input_ids"]), 2) + + self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2) + self.assertEqual(truncated_sequence, truncated_first_sequence) + + self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq1_input_ids)) + self.assertEqual(overflowing_tokens, overflow_first_sequence) + self.assertEqual(bbox, bbox_first_sequence) + self.assertEqual(overflowing_bbox, overflowing_token_bbox_first_sequence_fast) + else: + truncated_sequence = information_first_truncated["input_ids"] + overflowing_tokens = information_first_truncated["overflowing_tokens"] + overflowing_bbox = information_first_truncated["overflowing_token_boxes"] + bbox = information_first_truncated["bbox"] + + self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2) + self.assertEqual(truncated_sequence, truncated_first_sequence) + + self.assertEqual(len(overflowing_tokens), 2 + stride) + self.assertEqual(overflowing_tokens, seq0_input_ids[-(2 + stride) :]) + self.assertEqual(bbox, bbox_first_sequence) + self.assertEqual(overflowing_bbox, overflowing_token_bbox_first_sequence_slow) + + information_second_truncated = tokenizer( + question_0, + seq_1, + 
boxes=boxes_1, + max_length=len(sequence["input_ids"]) - 2, + add_special_tokens=False, + stride=stride, + truncation="only_second", + return_overflowing_tokens=True, + # add_prefix_space=False, + ) + # Overflowing tokens are handled quite differently in slow and fast tokenizers + if isinstance(tokenizer, LayoutLMv3TokenizerFast): + truncated_sequence = information_second_truncated["input_ids"][0] + overflowing_tokens = information_second_truncated["input_ids"][1] + bbox = information_second_truncated["bbox"][0] + overflowing_bbox = information_second_truncated["bbox"][1] + + self.assertEqual(len(information_second_truncated["input_ids"]), 2) + + self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2) + self.assertEqual(truncated_sequence, truncated_second_sequence) + + self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq0_input_ids)) + self.assertEqual(overflowing_tokens, overflow_second_sequence) + self.assertEqual(bbox, bbox_second_sequence) + self.assertEqual(overflowing_bbox, overflowing_token_bbox_second_sequence_fast) + else: + truncated_sequence = information_second_truncated["input_ids"] + overflowing_tokens = information_second_truncated["overflowing_tokens"] + bbox = information_second_truncated["bbox"] + overflowing_bbox = information_second_truncated["overflowing_token_boxes"] + + self.assertEqual(len(truncated_sequence), len(sequence["input_ids"]) - 2) + self.assertEqual(truncated_sequence, truncated_second_sequence) + + self.assertEqual(len(overflowing_tokens), 2 + stride) + self.assertEqual(overflowing_tokens, seq1_input_ids[-(2 + stride) :]) + self.assertEqual(bbox, bbox_second_sequence) + self.assertEqual(overflowing_bbox, overflowing_token_bbox_second_sequence_slow) + + def test_maximum_encoding_length_single_input(self): + tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + seq_0, boxes_0, ids = self.get_clean_sequence(tokenizer, max_length=20) + + sequence = tokenizer(seq_0, boxes=boxes_0, add_special_tokens=False) + total_length = len(sequence["input_ids"]) + + self.assertGreater( + total_length, 4, "Issue with the testing sequence, please update it, it's too short" + ) + + # Test with max model input length + model_max_length = tokenizer.model_max_length + self.assertEqual(model_max_length, 100) + seq_1 = seq_0 * model_max_length + boxes_1 = boxes_0 * model_max_length + sequence1 = tokenizer(seq_1, boxes=boxes_1, add_special_tokens=False) + total_length1 = len(sequence1["input_ids"]) + self.assertGreater( + total_length1, + model_max_length, + "Issue with the testing sequence, please update it, it's too short", + ) + + # Simple + padding_strategies = ( + [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False] + ) + for padding_state in padding_strategies: + with self.subTest(f"Padding: {padding_state}"): + for truncation_state in [True, "longest_first", "only_first"]: + with self.subTest(f"Truncation: {truncation_state}"): + output = tokenizer( + seq_1, + boxes=boxes_1, + padding=padding_state, + truncation=truncation_state, + ) + + self.assertEqual(len(output["input_ids"]), model_max_length) + self.assertEqual(len(output["bbox"]), model_max_length) + + output = tokenizer( + [seq_1], + boxes=[boxes_1], + padding=padding_state, + truncation=truncation_state, + ) + self.assertEqual(len(output["input_ids"][0]), model_max_length) + self.assertEqual(len(output["bbox"][0]), model_max_length) + + # 
Simple with no truncation + # Reset warnings + tokenizer.deprecation_warnings = {} + with self.assertLogs("transformers", level="WARNING") as cm: + output = tokenizer(seq_1, boxes=boxes_1, padding=padding_state, truncation=False) + self.assertNotEqual(len(output["input_ids"]), model_max_length) + self.assertNotEqual(len(output["bbox"]), model_max_length) + self.assertEqual(len(cm.records), 1) + self.assertTrue( + cm.records[0].message.startswith( + "Token indices sequence length is longer than the specified maximum sequence length" + " for this model" + ) + ) + + tokenizer.deprecation_warnings = {} + with self.assertLogs("transformers", level="WARNING") as cm: + output = tokenizer([seq_1], boxes=[boxes_1], padding=padding_state, truncation=False) + self.assertNotEqual(len(output["input_ids"][0]), model_max_length) + self.assertNotEqual(len(output["bbox"][0]), model_max_length) + self.assertEqual(len(cm.records), 1) + self.assertTrue( + cm.records[0].message.startswith( + "Token indices sequence length is longer than the specified maximum sequence length" + " for this model" + ) + ) + # Check the order of Sequence of input ids, overflowing tokens and bbox sequence with truncation + stride = 2 + information = tokenizer( + seq_0, + boxes=boxes_0, + max_length=total_length - 2, + add_special_tokens=False, + stride=stride, + truncation=True, + return_overflowing_tokens=True, + # add_prefix_space=False, + ) + + # Overflowing tokens are handled quite differently in slow and fast tokenizers + if isinstance(tokenizer, LayoutLMv3TokenizerFast): + truncated_sequence = information["input_ids"][0] + overflowing_tokens = information["input_ids"][1] + # bbox = information["bbox"][0] + # overflowing_bbox = information["bbox"][1] + self.assertEqual(len(information["input_ids"]), 2) + + self.assertEqual(len(truncated_sequence), total_length - 2) + self.assertEqual(truncated_sequence, sequence["input_ids"][:-2]) + + self.assertEqual(len(overflowing_tokens), 2 + stride) + self.assertEqual(overflowing_tokens, sequence["input_ids"][-(2 + stride) :]) + + # self.assertEqual(bbox, sequence["bbox"][:-2]) + # self.assertEqual(overflowing_bbox, sequence["bbox"][-(2 + stride) :]) + else: + truncated_sequence = information["input_ids"] + overflowing_tokens = information["overflowing_tokens"] + # bbox = information["bbox"] + # overflowing_bbox = information["overflowing_token_boxes"] + self.assertEqual(len(truncated_sequence), total_length - 2) + self.assertEqual(truncated_sequence, sequence["input_ids"][:-2]) + + self.assertEqual(len(overflowing_tokens), 2 + stride) + self.assertEqual(overflowing_tokens, sequence["input_ids"][-(2 + stride) :]) + # self.assertEqual(bbox, sequence["bbox"][:-2]) + # self.assertEqual(overflowing_bbox, sequence["bbox"][-(2 + stride) :]) + + @unittest.skip(reason="LayoutLMv3 tokenizer requires boxes besides sequences.") + def test_pretokenized_inputs(self): + pass + + @unittest.skip(reason="LayoutLMv3 tokenizer always expects pretokenized inputs.") + def test_compare_pretokenized_inputs(self): + pass + + @unittest.skip(reason="LayoutLMv3 fast tokenizer does not support prepare_for_model") + def test_compare_prepare_for_model(self): + pass + + @slow + def test_only_label_first_subword(self): + words = ["hello", "niels", "0000000000000000"] + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + word_labels = [0, 1, 2] + + # test slow tokenizer + tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False) + encoding = tokenizer_p(words, 
boxes=boxes, word_labels=word_labels) + self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100]) + + tokenizer_p = LayoutLMv3Tokenizer.from_pretrained( + "microsoft/layoutlmv3-base", + only_label_first_subword=False, + add_visual_labels=False, + ) + encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels) + self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100]) + + # test fast tokenizer + tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", add_visual_labels=False) + encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels) + self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100]) + + tokenizer_r = LayoutLMv3Tokenizer.from_pretrained( + "microsoft/layoutlmv3-base", + only_label_first_subword=False, + add_visual_labels=False, + ) + encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels) + self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100]) + + @slow + def test_layoutlmv3_integration_test(self): + tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base") + tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base") + + # There are 3 cases: + # CASE 1: document image classification (training + inference), document image token classification (inference), + # in which case only words and normalized bounding boxes are provided to the tokenizer + # CASE 2: document image token classification (training), + # in which case one also provides word labels to the tokenizer + # CASE 3: document image visual question answering (inference), + # in which case one also provides a question to the tokenizer + + # We need to test all 3 cases both on batched and non-batched inputs. + + # CASE 1: not batched + words, boxes = self.get_words_and_boxes() + + expected_results = {'input_ids': [0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} # fmt: skip + + encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20) + encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # CASE 1: batched + words, boxes = self.get_words_and_boxes_batch() + + expected_results = {'input_ids': [[0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 92, 614, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [256, 38, 330, 58], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'attention_mask': [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip + + encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20) + encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # CASE 2: not batched + words, boxes = self.get_words_and_boxes() + word_labels = [1, 2] + + expected_results = {'input_ids': [0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'labels': [-100, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'attention_mask': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} # fmt: skip + + encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) + encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # # CASE 2: batched + words, boxes = self.get_words_and_boxes_batch() + word_labels = [[1, 2], [2, 46]] + + expected_results = {'input_ids': [[0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 92, 614, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [256, 38, 330, 58], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'labels': [[-100, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], [-100, 2, 46, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]], 'attention_mask': [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip + + encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) + encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # # CASE 3: not batched + question, words, boxes = self.get_question_words_and_boxes() + + expected_results = {'input_ids': [0, 99, 18, 39, 766, 116, 2, 2, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]} # fmt: skip + + encoding_p = tokenizer_p(question, words, boxes, padding="max_length", max_length=20) + encoding_r = tokenizer_r(question, words, boxes, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # # CASE 3: batched + questions, words, boxes = self.get_question_words_and_boxes_batch() + + expected_results = {'input_ids': [[0, 99, 18, 39, 766, 116, 2, 2, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 141, 16, 37, 373, 116, 2, 2, 13964, 795, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [256, 38, 330, 58], [256, 38, 330, 58], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip + + encoding_p = tokenizer_p(questions, words, boxes, padding="max_length", max_length=20) + encoding_r = tokenizer_r(questions, words, boxes, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + @unittest.skip(reason="Doesn't support another framework than PyTorch") + def test_np_encode_plus_sent_to_model(self): + pass + + @require_tf + @slow + def test_tf_encode_plus_sent_to_model(self): + from transformers import TF_MODEL_MAPPING, TOKENIZER_MAPPING + + MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(TF_MODEL_MAPPING, TOKENIZER_MAPPING) + + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: + self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING") + + config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] + config = config_class() + + if config.is_encoder_decoder or config.pad_token_id is None: + self.skipTest(reason="Model is an encoder-decoder or has no pad token id set.") + + model = model_class(config) + + # Make sure the model contains at least the full vocabulary size in its embedding matrix + self.assertGreaterEqual(model.config.vocab_size, len(tokenizer)) + + # Build sequence + first_ten_tokens = list(tokenizer.get_vocab().keys())[:10] + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(first_ten_tokens))] + encoded_sequence = tokenizer.encode_plus(first_ten_tokens, boxes=boxes, return_tensors="tf") + batch_encoded_sequence = tokenizer.batch_encode_plus( + [first_ten_tokens, first_ten_tokens], boxes=[boxes, boxes], return_tensors="tf" + ) + + # This should not fail + model(encoded_sequence) + model(batch_encoded_sequence) + + @unittest.skip(reason="Chat is not supported") + def test_chat_template(self): + pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask(self): + pass + + @unittest.skip("Chat is not supported") + def 
test_chat_template_return_assistant_tokens_mask_truncated(self): + pass diff --git a/docs/transformers/tests/models/layoutxlm/__init__.py b/docs/transformers/tests/models/layoutxlm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/layoutxlm/test_processor_layoutxlm.py b/docs/transformers/tests/models/layoutxlm/test_processor_layoutxlm.py new file mode 100644 index 0000000000000000000000000000000000000000..57872eda807133cfd4306d5c14c2dce6b31af73a --- /dev/null +++ b/docs/transformers/tests/models/layoutxlm/test_processor_layoutxlm.py @@ -0,0 +1,482 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil +import tempfile +import unittest + +from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast +from transformers.models.layoutxlm import LayoutXLMProcessor, LayoutXLMTokenizer, LayoutXLMTokenizerFast +from transformers.testing_utils import ( + require_pytesseract, + require_sentencepiece, + require_tokenizers, + require_torch, + slow, +) +from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_pytesseract_available(): + from PIL import Image + + from transformers import LayoutLMv2ImageProcessor + + +@require_pytesseract +@require_sentencepiece +@require_tokenizers +class LayoutXLMProcessorTest(ProcessorTesterMixin, unittest.TestCase): + tokenizer_class = LayoutXLMTokenizer + rust_tokenizer_class = LayoutXLMTokenizerFast + processor_class = LayoutXLMProcessor + + @classmethod + def setUpClass(cls): + image_processor_map = { + "do_resize": True, + "size": 224, + "apply_ocr": True, + } + + cls.tmpdirname = tempfile.mkdtemp() + cls.feature_extraction_file = os.path.join(cls.tmpdirname, FEATURE_EXTRACTOR_NAME) + with open(cls.feature_extraction_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(image_processor_map) + "\n") + + # taken from `test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_save_pretrained` + cls.tokenizer_pretrained_name = "hf-internal-testing/tiny-random-layoutxlm" + + tokenizer = cls.get_tokenizer() + image_processor = cls.get_image_processor() + processor = LayoutXLMProcessor(tokenizer=tokenizer, image_processor=image_processor) + processor.save_pretrained(cls.tmpdirname) + + @classmethod + def get_tokenizer(cls, **kwargs) -> PreTrainedTokenizer: + return cls.tokenizer_class.from_pretrained(cls.tokenizer_pretrained_name, **kwargs) + + @classmethod + def get_rust_tokenizer(cls, **kwargs) -> PreTrainedTokenizerFast: + return cls.rust_tokenizer_class.from_pretrained(cls.tokenizer_pretrained_name, **kwargs) + + @classmethod + def get_tokenizers(cls, **kwargs) -> list[PreTrainedTokenizerBase]: + return [cls.get_tokenizer(**kwargs), cls.get_rust_tokenizer(**kwargs)] + + @classmethod + def get_image_processor(cls, 
**kwargs): + return LayoutLMv2ImageProcessor.from_pretrained(cls.tmpdirname, **kwargs) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname, ignore_errors=True) + + def test_save_load_pretrained_default(self): + image_processor = self.get_image_processor() + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer) + + with tempfile.TemporaryDirectory() as tmpdir: + processor.save_pretrained(tmpdir) + processor = LayoutXLMProcessor.from_pretrained(tmpdir) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) + self.assertIsInstance(processor.tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast)) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor) + + def test_save_load_pretrained_additional_features(self): + with tempfile.TemporaryDirectory() as tmpdir: + processor = LayoutXLMProcessor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer()) + processor.save_pretrained(tmpdir) + + # slow tokenizer + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30) + + processor = LayoutXLMProcessor.from_pretrained( + tmpdir, + use_fast=False, + bos_token="(BOS)", + eos_token="(EOS)", + do_resize=False, + size=30, + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizer) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor) + + # fast tokenizer + tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30) + + processor = LayoutXLMProcessor.from_pretrained( + self.tmpdirname, use_xlm=True, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, LayoutXLMTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, LayoutLMv2ImageProcessor) + + def test_model_input_names(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = LayoutXLMProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # add extra args + inputs = processor(text=input_str, images=image_input, return_codebook_pixels=False, return_image_mask=False) + + self.assertListEqual(list(inputs.keys()), processor.model_input_names) + + @slow + def test_overflowing_tokens(self): + # In the case of overflowing tokens, test that we still have 1-to-1 mapping between the images and input_ids (sequences that are too long are broken down into multiple sequences). 
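+ # Note: the processor call below uses truncation=True with return_overflowing_tokens=True and stride=50, + # so any document longer than max_length=512 is split into overlapping windows, and the final assertion + # checks that one image entry is returned for every resulting window.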
+ + from datasets import load_dataset + + # set up + datasets = load_dataset("nielsr/funsd", trust_remote_code=True) + processor = LayoutXLMProcessor.from_pretrained("microsoft/layoutxlm-base", apply_ocr=False) + + def preprocess_data(examples): + images = [Image.open(path).convert("RGB") for path in examples["image_path"]] + words = examples["words"] + boxes = examples["bboxes"] + word_labels = examples["ner_tags"] + encoded_inputs = processor( + images, + words, + boxes=boxes, + word_labels=word_labels, + max_length=512, + padding="max_length", + truncation=True, + return_overflowing_tokens=True, + stride=50, + return_offsets_mapping=True, + return_tensors="pt", + ) + return encoded_inputs + + train_data = preprocess_data(datasets["train"]) + + self.assertEqual(len(train_data["image"]), len(train_data["input_ids"])) + + +# different use cases tests +@require_sentencepiece +@require_torch +@require_pytesseract +class LayoutXLMProcessorIntegrationTests(unittest.TestCase): + @cached_property + def get_images(self): + # we verify our implementation on 2 document images from the DocVQA dataset + from datasets import load_dataset + + ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test", trust_remote_code=True) + + image_1 = Image.open(ds[0]["file"]).convert("RGB") + image_2 = Image.open(ds[1]["file"]).convert("RGB") + + return image_1, image_2 + + @cached_property + def get_tokenizers(self): + slow_tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base") + fast_tokenizer = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base") + return [slow_tokenizer, fast_tokenizer] + + @slow + def test_processor_case_1(self): + # case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True + + image_processor = LayoutLMv2ImageProcessor() + tokenizers = self.get_tokenizers + images = self.get_images + + for tokenizer in tokenizers: + processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer) + + # not batched + input_feat_extract = image_processor(images[0], return_tensors="pt") + input_processor = processor(images[0], return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids"] + actual_keys = sorted(input_processor.keys()) + self.assertListEqual(actual_keys, expected_keys) + + # verify image + self.assertAlmostEqual( + input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2 + ) + + # verify input_ids + # this was obtained with Tesseract 4.1.1 + expected_decoding = " 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. 
Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer" # fmt: skip + decoding = processor.decode(input_processor.input_ids.squeeze().tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # batched + input_feat_extract = image_processor(images, return_tensors="pt") + input_processor = processor(images, padding=True, return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids"] + actual_keys = sorted(input_processor.keys()) + self.assertListEqual(actual_keys, expected_keys) + + # verify images + self.assertAlmostEqual( + input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2 + ) + + # verify input_ids + # this was obtained with Tesseract 4.1.1 + expected_decoding = " 7 ITC Limited REPORT AND ACCOUNTS 2013 ITC’s Brands: An Asset for the Nation The consumer needs and aspirations they fulfil, the benefit they generate for millions across ITC’s value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITC’s brands national assets, adding to India’s competitiveness. It is ITC’s aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. DI WILLS * ; LOVE DELIGHTFULLY SOFT SKIN? aia Ans Source: https://www.industrydocuments.ucsf.edu/docs/snbx0223" # fmt: skip + decoding = processor.decode(input_processor.input_ids[1].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + @slow + def test_processor_case_2(self): + # case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False + + image_processor = LayoutLMv2ImageProcessor(apply_ocr=False) + tokenizers = self.get_tokenizers + images = self.get_images + + for tokenizer in tokenizers: + processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer) + + # not batched + words = ["hello", "world"] + boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] + input_processor = processor(images[0], words, boxes=boxes, return_tensors="pt") + + # verify keys + expected_keys = ["input_ids", "bbox", "attention_mask", "image"] + actual_keys = list(input_processor.keys()) + for key in expected_keys: + self.assertIn(key, actual_keys) + + # verify input_ids + expected_decoding = " hello world" + decoding = processor.decode(input_processor.input_ids.squeeze().tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # batched + words = [["hello", "world"], ["my", "name", "is", "niels"]] + boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]] + input_processor = processor(images, words, boxes=boxes, padding=True, return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids"] + actual_keys = sorted(input_processor.keys()) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + expected_decoding = " hello world" + decoding = processor.decode(input_processor.input_ids[0].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # verify bbox + expected_bbox = [ + [0, 0, 0, 
0], + [3, 2, 5, 1], + [6, 7, 4, 2], + [3, 9, 2, 4], + [1, 1, 2, 3], + [1, 1, 2, 3], + [1000, 1000, 1000, 1000], + ] + self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox) + + @slow + def test_processor_case_3(self): + # case 3: token classification (training), apply_ocr=False + + image_processor = LayoutLMv2ImageProcessor(apply_ocr=False) + tokenizers = self.get_tokenizers + images = self.get_images + + for tokenizer in tokenizers: + processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer) + + # not batched + words = ["weirdly", "world"] + boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] + word_labels = [1, 2] + input_processor = processor(images[0], words, boxes=boxes, word_labels=word_labels, return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels"] + actual_keys = sorted(input_processor.keys()) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + expected_decoding = " weirdly world" + decoding = processor.decode(input_processor.input_ids.squeeze().tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # verify labels + expected_labels = [-100, 1, -100, 2, -100] + self.assertListEqual(input_processor.labels.squeeze().tolist(), expected_labels) + + # batched + words = [["hello", "world"], ["my", "name", "is", "niels"]] + boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]] + word_labels = [[1, 2], [6, 3, 10, 2]] + input_processor = processor( + images, words, boxes=boxes, word_labels=word_labels, padding=True, return_tensors="pt" + ) + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels"] + actual_keys = sorted(input_processor.keys()) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + expected_decoding = " my name is niels" + decoding = processor.decode(input_processor.input_ids[1].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # verify bbox + expected_bbox = [ + [0, 0, 0, 0], + [3, 2, 5, 1], + [6, 7, 4, 2], + [3, 9, 2, 4], + [1, 1, 2, 3], + [1, 1, 2, 3], + [1000, 1000, 1000, 1000], + ] + self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox) + + # verify labels + expected_labels = [-100, 6, 3, 10, 2, -100, -100] + self.assertListEqual(input_processor.labels[1].tolist(), expected_labels) + + @slow + def test_processor_case_4(self): + # case 4: visual question answering (inference), apply_ocr=True + + image_processor = LayoutLMv2ImageProcessor() + tokenizers = self.get_tokenizers + images = self.get_images + + for tokenizer in tokenizers: + processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer) + + # not batched + question = "What's his name?" + input_processor = processor(images[0], question, return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids"] + actual_keys = sorted(input_processor.keys()) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + # this was obtained with Tesseract 4.1.1 + expected_decoding = " What's his name? 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. 
Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer" # fmt: skip + decoding = processor.decode(input_processor.input_ids.squeeze().tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # batched + questions = ["How old is he?", "what's the time"] + input_processor = processor( + images, questions, padding="max_length", max_length=20, truncation=True, return_tensors="pt" + ) + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids"] + actual_keys = sorted(input_processor.keys()) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + # this was obtained with Tesseract 4.1.1 + expected_decoding = " what's the time 7 ITC Limited REPORT AND ACCOUNTS 2013" + decoding = processor.decode(input_processor.input_ids[1].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # verify bbox + expected_bbox = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [0, 45, 67, 80], [72, 56, 109, 67], [72, 56, 109, 67], [116, 56, 189, 67], [198, 59, 253, 66], [257, 59, 285, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [372, 59, 407, 66], [1000, 1000, 1000, 1000]] # fmt: skip + self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox) + + @slow + def test_processor_case_5(self): + # case 5: visual question answering (inference), apply_ocr=False + + image_processor = LayoutLMv2ImageProcessor(apply_ocr=False) + tokenizers = self.get_tokenizers + images = self.get_images + + for tokenizer in tokenizers: + processor = LayoutXLMProcessor(image_processor=image_processor, tokenizer=tokenizer) + + # not batched + question = "What's his name?" + words = ["hello", "world"] + boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] + input_processor = processor(images[0], question, words, boxes, return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids"] + actual_keys = sorted(input_processor.keys()) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + expected_decoding = " What's his name? hello world" + decoding = processor.decode(input_processor.input_ids.squeeze().tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # batched + questions = ["How old is he?", "what's the time"] + words = [["hello", "world"], ["my", "name", "is", "niels"]] + boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]] + input_processor = processor(images, questions, words, boxes, padding=True, return_tensors="pt") + + # verify keys + expected_keys = ["attention_mask", "bbox", "image", "input_ids"] + actual_keys = sorted(input_processor.keys()) + self.assertListEqual(actual_keys, expected_keys) + + # verify input_ids + expected_decoding = " How old is he? 
hello world" + decoding = processor.decode(input_processor.input_ids[0].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + expected_decoding = " what's the time my name is niels" + decoding = processor.decode(input_processor.input_ids[1].tolist()) + self.assertSequenceEqual(decoding, expected_decoding) + + # verify bbox + expected_bbox = [[6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3], [1, 1, 2, 3], [1000, 1000, 1000, 1000]] + self.assertListEqual(input_processor.bbox[1].tolist()[-5:], expected_bbox) diff --git a/docs/transformers/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/docs/transformers/tests/models/layoutxlm/test_tokenization_layoutxlm.py new file mode 100644 index 0000000000000000000000000000000000000000..2bd76954d6dd4c7b3a6066050920595e6227132e --- /dev/null +++ b/docs/transformers/tests/models/layoutxlm/test_tokenization_layoutxlm.py @@ -0,0 +1,1953 @@ +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import shutil +import tempfile +import unittest + +from parameterized import parameterized + +from transformers import ( + AddedToken, + LayoutXLMTokenizerFast, + SpecialTokensMixin, + is_tf_available, + is_torch_available, + logging, +) +from transformers.models.layoutxlm.tokenization_layoutxlm import LayoutXLMTokenizer +from transformers.testing_utils import ( + get_tests_dir, + require_pandas, + require_sentencepiece, + require_tokenizers, + require_torch, + slow, +) + +from ...test_tokenization_common import ( + SMALL_TRAINING_CORPUS, + TokenizerTesterMixin, + filter_non_english, + merge_model_tokenizer_mappings, +) + + +logger = logging.get_logger(__name__) +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") + + +@require_sentencepiece +@require_tokenizers +@require_pandas +class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "FacebookAI/xlm-roberta-base" + tokenizer_class = LayoutXLMTokenizer + rust_tokenizer_class = LayoutXLMTokenizerFast + test_rust_tokenizer = True + from_pretrained_filter = filter_non_english + test_seq2seq = False + test_sentencepiece = True + maxDiff = None + + def get_words_and_boxes(self): + words = ["a", "weirdly", "test"] + boxes = [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]] + + return words, boxes + + def get_words_and_boxes_batch(self): + words = [["a", "weirdly", "test"], ["hello", "my", "name", "is", "bob"]] + boxes = [ + [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]], + [[961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69]], + ] + + return words, boxes + + def get_question_words_and_boxes(self): + question = "what's his name?" 
+ words = ["a", "weirdly", "test"] + boxes = [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]] + + return question, words, boxes + + def get_question_words_and_boxes_batch(self): + questions = ["what's his name?", "how is he called?"] + words = [["a", "weirdly", "test"], ["what", "a", "laif", "gastn"]] + boxes = [ + [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]], + [[256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69]], + ] + + return questions, words, boxes + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # We have a SentencePiece fixture for testing + tokenizer = LayoutXLMTokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer.save_pretrained(cls.tmpdirname) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00e9d,running" + output_text = "unwanted, running" + return input_text, output_text + + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") + def test_chat_template_batched(self): + pass + + # override test in `test_tokenization_common.py` because of the required input format of the `__call__`` method of + # this tokenizer + def test_save_sentencepiece_tokenizer(self) -> None: + if not self.test_sentencepiece or not self.test_slow_tokenizer: + self.skipTest(reason="test_sentencepiece or test_slow_tokenizer is set to False") + # We want to verify that we will be able to save the tokenizer even if the original files that were used to + # build the tokenizer have been deleted in the meantime. + words, boxes = self.get_words_and_boxes() + + tokenizer_slow_1 = self.get_tokenizer() + encoding_tokenizer_slow_1 = tokenizer_slow_1( + words, + boxes=boxes, + ) + + tmpdirname_1 = tempfile.mkdtemp() + tmpdirname_2 = tempfile.mkdtemp() + + tokenizer_slow_1.save_pretrained(tmpdirname_1) + tokenizer_slow_2 = self.tokenizer_class.from_pretrained(tmpdirname_1) + encoding_tokenizer_slow_2 = tokenizer_slow_2( + words, + boxes=boxes, + ) + + shutil.rmtree(tmpdirname_1) + tokenizer_slow_2.save_pretrained(tmpdirname_2) + + tokenizer_slow_3 = self.tokenizer_class.from_pretrained(tmpdirname_2) + encoding_tokenizer_slow_3 = tokenizer_slow_3( + words, + boxes=boxes, + ) + shutil.rmtree(tmpdirname_2) + + self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_2) + self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_3) + + def test_split_special_tokens(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + special_token = "" + special_sentence = f"Hey this is a {special_token} token" + _, _, boxes = self.get_question_words_and_boxes() + + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_rust = self.get_rust_tokenizer( + pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs + ) + tokenizer_py = self.tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs + ) + + py_tokens_output = tokenizer_py.tokenize(special_sentence) + rust_tokens_output = tokenizer_rust.tokenize(special_sentence) + + self.assertTrue(special_token not in py_tokens_output) + self.assertTrue(special_token not in rust_tokens_output) + + py_tokens_output_unsplit = tokenizer_py.tokenize(special_sentence, split_special_tokens=False) + rust_tokens_output_unsplit = tokenizer_rust.tokenize(special_sentence, split_special_tokens=False) + + self.assertTrue(special_token in py_tokens_output_unsplit) + 
self.assertTrue(special_token in rust_tokens_output_unsplit) + + tmpdirname = tempfile.mkdtemp() + tokenizer_py.save_pretrained(tmpdirname) + fast_from_saved = self.tokenizer_class.from_pretrained(tmpdirname) + + output_tokens_reloaded_split = fast_from_saved.tokenize(special_sentence) + self.assertTrue(special_token not in output_tokens_reloaded_split) + + output_tokens_reloaded_unsplit = fast_from_saved.tokenize(special_sentence, split_special_tokens=False) + self.assertTrue(special_token in output_tokens_reloaded_unsplit) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutxlm-base") + + question, words, boxes = self.get_question_words_and_boxes() + + text = tokenizer.encode( + question.split(), + boxes=[tokenizer.pad_token_box for _ in range(len(question.split()))], + add_special_tokens=False, + ) + text_2 = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_pair == [0] + text + [2] + [2] + text_2 + [2] + + def test_offsets_with_special_characters(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + + words, boxes = self.get_words_and_boxes() + words[1] = tokenizer_r.mask_token + tokens = tokenizer_r.encode_plus( + words, + boxes=boxes, + return_attention_mask=False, + return_token_type_ids=False, + return_offsets_mapping=True, + add_special_tokens=True, + ) + + expected_results = [ + ((0, 0), tokenizer_r.cls_token), + ((0, 1), "▁a"), + ((0, 6), tokenizer_r.mask_token), + ((0, 4), "▁test"), + ((0, 0), tokenizer_r.sep_token), + ] + + self.assertEqual( + [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"]) + ) + self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"]) + + def test_add_special_tokens(self): + tokenizers: list[LayoutXLMTokenizer] = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + special_token = "[SPECIAL_TOKEN]" + special_token_box = [1000, 1000, 1000, 1000] + + tokenizer.add_special_tokens({"cls_token": special_token}) + encoded_special_token = tokenizer.encode( + [special_token], boxes=[special_token_box], add_special_tokens=False + ) + self.assertEqual(len(encoded_special_token), 1) + + decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True) + self.assertTrue(special_token not in decoded) + + def test_add_tokens_tokenizer(self): + tokenizers: list[LayoutXLMTokenizer] = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + vocab_size = tokenizer.vocab_size + all_size = len(tokenizer) + + self.assertNotEqual(vocab_size, 0) + + # We usually have added tokens from the start in tests because our vocab fixtures are + # smaller than the original vocabs - let's not assert this + # self.assertEqual(vocab_size, all_size) + + new_toks = ["aaaaa", "bbbbbb", "cccccccccdddddddd"] + added_toks = tokenizer.add_tokens(new_toks) + vocab_size_2 = tokenizer.vocab_size + all_size_2 = len(tokenizer) + + self.assertNotEqual(vocab_size_2, 0) + self.assertEqual(vocab_size, vocab_size_2) + self.assertEqual(added_toks, len(new_toks)) + self.assertEqual(all_size_2, all_size + len(new_toks)) + + words = "aaaaa bbbbbb low cccccccccdddddddd 
l".split() + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + + tokens = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + + self.assertGreaterEqual(len(tokens), 4) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + + new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} + added_toks_2 = tokenizer.add_special_tokens(new_toks_2) + vocab_size_3 = tokenizer.vocab_size + all_size_3 = len(tokenizer) + + self.assertNotEqual(vocab_size_3, 0) + self.assertEqual(vocab_size, vocab_size_3) + self.assertEqual(added_toks_2, len(new_toks_2)) + self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) + + words = ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l".split() + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + + tokens = tokenizer.encode( + words, + boxes=boxes, + add_special_tokens=False, + ) + + self.assertGreaterEqual(len(tokens), 6) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[0], tokens[1]) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokens[-3]) + self.assertEqual(tokens[0], tokenizer.eos_token_id) + self.assertEqual(tokens[-2], tokenizer.pad_token_id) + + @require_tokenizers + def test_encode_decode_with_spaces(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + + new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)] + tokenizer.add_tokens(new_toks) + input = "[ABC][DEF][ABC][DEF]" + if self.space_between_special_tokens: + output = "[ABC] [DEF] [ABC] [DEF]" + else: + output = input + encoded = tokenizer.encode(input.split(), boxes=boxes, add_special_tokens=False) + decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) + self.assertIn(decoded, [output, output.lower()]) + + @parameterized.expand([(True,), (False,)]) + def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + padding_size = 10 + padding_idx = tokenizer.pad_token_id + + encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_special_tokens_mask=True) + input_ids = encoded_sequence["input_ids"] + special_tokens_mask = encoded_sequence["special_tokens_mask"] + sequence_length = len(input_ids) + + # Test 'longest' and 'no_padding' don't do anything + tokenizer.padding_side = "right" + + not_padded_sequence = tokenizer.encode_plus( + words, + boxes=boxes, + padding=False, + return_special_tokens_mask=True, + ) + not_padded_input_ids = not_padded_sequence["input_ids"] + + not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] + not_padded_sequence_length = len(not_padded_input_ids) + + self.assertTrue(sequence_length == not_padded_sequence_length) + self.assertTrue(input_ids == not_padded_input_ids) + self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) + + not_padded_sequence = tokenizer.encode_plus( + words, + boxes=boxes, + padding=False, + return_special_tokens_mask=True, + ) + 
not_padded_input_ids = not_padded_sequence["input_ids"] + + not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] + not_padded_sequence_length = len(not_padded_input_ids) + + self.assertTrue(sequence_length == not_padded_sequence_length) + self.assertTrue(input_ids == not_padded_input_ids) + self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) + + # Test right padding + tokenizer_kwargs_right = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "right" + else: + tokenizer_kwargs_right["padding_side"] = "right" + + right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right) + right_padded_input_ids = right_padded_sequence["input_ids"] + + right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] + right_padded_sequence_length = len(right_padded_input_ids) + + self.assertTrue(sequence_length + padding_size == right_padded_sequence_length) + self.assertTrue(input_ids + [padding_idx] * padding_size == right_padded_input_ids) + self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask) + + # Test left padding + tokenizer_kwargs_left = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "left" + else: + tokenizer_kwargs_left["padding_side"] = "left" + + left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left) + left_padded_input_ids = left_padded_sequence["input_ids"] + left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] + left_padded_sequence_length = len(left_padded_input_ids) + + self.assertTrue(sequence_length + padding_size == left_padded_sequence_length) + self.assertTrue([padding_idx] * padding_size + input_ids == left_padded_input_ids) + self.assertTrue([1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask) + + if "token_type_ids" in tokenizer.model_input_names: + token_type_ids = encoded_sequence["token_type_ids"] + left_padded_token_type_ids = left_padded_sequence["token_type_ids"] + right_padded_token_type_ids = right_padded_sequence["token_type_ids"] + + assert token_type_ids + [0] * padding_size == right_padded_token_type_ids + assert [0] * padding_size + token_type_ids == left_padded_token_type_ids + + if "attention_mask" in tokenizer.model_input_names: + attention_mask = encoded_sequence["attention_mask"] + right_padded_attention_mask = right_padded_sequence["attention_mask"] + left_padded_attention_mask = left_padded_sequence["attention_mask"] + + self.assertTrue(attention_mask + [0] * padding_size == right_padded_attention_mask) + self.assertTrue([0] * padding_size + attention_mask == left_padded_attention_mask) + + def test_internal_consistency(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + + tokens = [] + for word in words: + tokens.extend(tokenizer.tokenize(word)) + ids = tokenizer.convert_tokens_to_ids(tokens) + ids_2 = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + self.assertListEqual(ids, ids_2) + + tokens_2 = tokenizer.convert_ids_to_tokens(ids) + self.assertNotEqual(len(tokens_2), 0) + text_2 = tokenizer.decode(ids) + self.assertIsInstance(text_2, str) + + 
output_text = "a weirdly test" + self.assertEqual(text_2, output_text) + + def test_mask_output(self): + tokenizers = self.get_tokenizers(fast=False, do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + + if ( + tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer" + and "token_type_ids" in tokenizer.model_input_names + ): + information = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True) + sequences, mask = information["input_ids"], information["token_type_ids"] + self.assertEqual(len(sequences), len(mask)) + + def test_number_of_added_tokens(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # test 1: single sequence + words, boxes = self.get_words_and_boxes() + + sequences = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + attached_sequences = tokenizer.encode(words, boxes=boxes, add_special_tokens=True) + + # Method is implemented (e.g. not GPT-2) + if len(attached_sequences) != 2: + self.assertEqual( + tokenizer.num_special_tokens_to_add(pair=False), len(attached_sequences) - len(sequences) + ) + + # test 2: two sequences + question, words, boxes = self.get_question_words_and_boxes() + + sequences = tokenizer.encode(question, words, boxes=boxes, add_special_tokens=False) + attached_sequences = tokenizer.encode(question, words, boxes=boxes, add_special_tokens=True) + + # Method is implemented (e.g. not GPT-2) + if len(attached_sequences) != 2: + self.assertEqual( + tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences) + ) + + def test_padding_to_max_length(self): + """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated""" + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + padding_size = 10 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + padding_idx = tokenizer.pad_token_id + + # Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" + encoded_sequence = tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + # FIXME: the next line should be padding(max_length) to avoid warning + padded_sequence = tokenizer.encode( + words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + + # Check that nothing is done when a maximum length is not specified + encoded_sequence = tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True) + padded_sequence_right_length = len(padded_sequence_right) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + def test_padding(self, max_length=50): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with 
self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + + self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) + pad_token_id = tokenizer_p.pad_token_id + + # Encode - Simple input + words, boxes = self.get_words_and_boxes() + input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) + input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length") + input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length") + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.encode(words, boxes=boxes, padding="longest") + input_p = tokenizer_p.encode(words, boxes=boxes, padding=True) + self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) + + # Encode - Pair input + question, words, boxes = self.get_question_words_and_boxes() + input_r = tokenizer_r.encode( + question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode( + question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True + ) + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length") + input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length") + self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) + input_r = tokenizer_r.encode(question, words, boxes=boxes, padding=True) + input_p = tokenizer_p.encode(question, words, boxes=boxes, padding="longest") + self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) + + # Encode_plus - Simple input + words, boxes = self.get_words_and_boxes() + input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) + input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length") + input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length") + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + input_r = tokenizer_r.encode_plus(words, boxes=boxes, padding="longest") + input_p = tokenizer_p.encode_plus(words, boxes=boxes, padding=True) + self.assert_padded_input_match( + input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id + ) + + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + # Encode_plus - Pair input + question, words, boxes = self.get_question_words_and_boxes() + input_r = tokenizer_r.encode_plus( + question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode_plus( + question, words, boxes=boxes, 
max_length=max_length, pad_to_max_length=True + ) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus( + question, words, boxes=boxes, max_length=max_length, padding="max_length" + ) + input_p = tokenizer_p.encode_plus( + question, words, boxes=boxes, max_length=max_length, padding="max_length" + ) + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus(question, words, boxes=boxes, padding="longest") + input_p = tokenizer_p.encode_plus(question, words, boxes=boxes, padding=True) + self.assert_padded_input_match( + input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id + ) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + # Batch_encode_plus - Simple input + words, boxes = self.get_words_and_boxes_batch() + + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + pad_to_max_length=True, + ) + input_p = tokenizer_p.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + pad_to_max_length=True, + ) + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + padding="max_length", + ) + input_p = tokenizer_p.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + padding="max_length", + ) + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + padding="longest", + ) + input_p = tokenizer_p.batch_encode_plus( + words, + boxes=boxes, + max_length=max_length, + padding=True, + ) + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes, padding="longest") + input_p = tokenizer_p.batch_encode_plus(words, boxes=boxes, padding=True) + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + # Batch_encode_plus - Pair input + questions, words, boxes = self.get_question_words_and_boxes_batch() + + input_r = tokenizer_r.batch_encode_plus( + list(zip(questions, words)), + is_pair=True, + boxes=boxes, + max_length=max_length, + truncation=True, + padding="max_length", + ) + input_p = tokenizer_p.batch_encode_plus( + list(zip(questions, words)), + is_pair=True, + boxes=boxes, + max_length=max_length, + truncation=True, + padding="max_length", + ) + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + input_r = tokenizer_r.batch_encode_plus( + list(zip(questions, words)), + is_pair=True, + boxes=boxes, + padding=True, + ) + input_p = tokenizer_p.batch_encode_plus( + list(zip(questions, words)), + is_pair=True, + boxes=boxes, + padding="longest", + ) + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + # Using pad on single examples after tokenization + words, boxes = self.get_words_and_boxes() + input_r = tokenizer_r.encode_plus(words, boxes=boxes) + input_r = tokenizer_r.pad(input_r) + + input_p = tokenizer_r.encode_plus(words, boxes=boxes) + input_p = tokenizer_r.pad(input_p) + + 
self.assert_padded_input_match( + input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id + ) + + # Using pad on single examples after tokenization + input_r = tokenizer_r.encode_plus(words, boxes=boxes) + input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") + + input_p = tokenizer_r.encode_plus(words, boxes=boxes) + input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length") + + self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) + + # Using pad after tokenization + words, boxes = self.get_words_and_boxes_batch() + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + ) + input_r = tokenizer_r.pad(input_r) + + input_p = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + ) + input_p = tokenizer_r.pad(input_p) + + self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) + + # Using pad after tokenization + words, boxes = self.get_words_and_boxes_batch() + input_r = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + ) + input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") + + input_p = tokenizer_r.batch_encode_plus( + words, + boxes=boxes, + ) + input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length") + + self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) + + def test_padding_warning_message_fast_tokenizer(self): + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + words, boxes = self.get_words_and_boxes_batch() + + tokenizer_fast = self.get_rust_tokenizer() + + encoding_fast = tokenizer_fast( + words, + boxes=boxes, + ) + + with self.assertLogs("transformers", level="WARNING") as cm: + tokenizer_fast.pad(encoding_fast) + self.assertEqual(len(cm.records), 1) + self.assertIn( + "Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to" + " encode the text followed by a call to the `pad` method to get a padded encoding.", + cm.records[0].message, + ) + + if not self.test_slow_tokenizer: + self.skipTest(reason="test_slow_tokenizer is set to False") + + tokenizer_slow = self.get_tokenizer() + + encoding_slow = tokenizer_slow( + words, + boxes=boxes, + ) + + with self.assertLogs(level="WARNING") as cm: + # We want to assert there are no warnings, but the 'assertLogs' method does not support that. + # Therefore, we are adding a dummy warning, and then we will assert it is the only warning. 
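+ # If `pad` on the slow tokenizer's output logged anything here, `cm.records` would hold more than the dummy entry and the length check below would fail.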
+ logger.warning("Dummy warning") + tokenizer_slow.pad(encoding_slow) + self.assertEqual(len(cm.records), 1) + self.assertIn( + "Dummy warning", + cm.records[0].message, + ) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Test not batched + words, boxes = self.get_words_and_boxes() + encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes) + encoded_sequences_2 = tokenizer(words, boxes=boxes) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + # Test not batched pairs + question, words, boxes = self.get_question_words_and_boxes() + encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes) + encoded_sequences_2 = tokenizer(words, boxes=boxes) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + # Test batched + words, boxes = self.get_words_and_boxes_batch() + encoded_sequences_1 = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes) + encoded_sequences_2 = tokenizer(words, boxes=boxes) + self.assertEqual(encoded_sequences_1, encoded_sequences_2) + + def test_batch_encode_plus_batch_sequence_length(self): + # Tests that all encoded values have the correct size + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes_batch() + + encoded_sequences = [ + tokenizer.encode_plus(words_example, boxes=boxes_example) + for words_example, boxes_example in zip(words, boxes) + ] + encoded_sequences_batch = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes, padding=False) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + maximum_length = len( + max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len) + ) + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + encoded_sequences_padded = [ + tokenizer.encode_plus( + words_example, boxes=boxes_example, max_length=maximum_length, padding="max_length" + ) + for words_example, boxes_example in zip(words, boxes) + ] + + encoded_sequences_batch_padded = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, padding=True + ) + self.assertListEqual( + encoded_sequences_padded, + self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded), + ) + + # check 'longest' is unsensitive to a max length + encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, padding=True + ) + encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding="longest" + ) + for key in encoded_sequences_batch_padded_1.keys(): + self.assertListEqual( + encoded_sequences_batch_padded_1[key], + encoded_sequences_batch_padded_2[key], + ) + + # check 'no_padding' is unsensitive to a max length + encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, padding=False + ) + encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding=False + ) + for key in encoded_sequences_batch_padded_1.keys(): + self.assertListEqual( + encoded_sequences_batch_padded_1[key], + 
encoded_sequences_batch_padded_2[key], + ) + + @unittest.skip(reason="batch_encode_plus does not handle overflowing tokens.") + def test_batch_encode_plus_overflowing_tokens(self): + pass + + def test_batch_encode_plus_padding(self): + # Test that padded sequences are equivalent between batch_encode_plus and encode_plus + + # Right padding tests + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes_batch() + + max_length = 100 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + encoded_sequences = [ + tokenizer.encode_plus( + words_example, boxes=boxes_example, max_length=max_length, padding="max_length" + ) + for words_example, boxes_example in zip(words, boxes) + ] + encoded_sequences_batch = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length" + ) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + # Left padding tests + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + tokenizer.padding_side = "left" + words, boxes = self.get_words_and_boxes_batch() + + max_length = 100 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, words) + + encoded_sequences = [ + tokenizer.encode_plus( + words_example, boxes=boxes_example, max_length=max_length, padding="max_length" + ) + for words_example, boxes_example in zip(words, boxes) + ] + encoded_sequences_batch = tokenizer.batch_encode_plus( + words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length" + ) + self.assertListEqual( + encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) + ) + + def test_padding_to_multiple_of(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + if tokenizer.pad_token is None: + self.skipTest(reason="No padding token.") + else: + words, boxes = self.get_words_and_boxes() + + # empty_tokens = tokenizer([""], [[]], padding=True, pad_to_multiple_of=8) + normal_tokens = tokenizer(words, boxes=boxes, padding=True, pad_to_multiple_of=8) + # for key, value in empty_tokens.items(): + # self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + normal_tokens = tokenizer(words, boxes=boxes, pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + # Should also work with truncation + normal_tokens = tokenizer(words, boxes=boxes, padding=True, truncation=True, pad_to_multiple_of=8) + for key, value in normal_tokens.items(): + self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") + + # truncation to something which is not a multiple of pad_to_multiple_of raises an error + self.assertRaises( + ValueError, + tokenizer.__call__, + words, + boxes=boxes, + padding=True, + truncation=True, + max_length=12, + pad_to_multiple_of=8, + ) + + def test_tokenizer_slow_store_full_signature(self): + signature = 
inspect.signature(self.tokenizer_class.__init__) + tokenizer = self.get_tokenizer() + + for parameter_name, parameter in signature.parameters.items(): + if parameter.default != inspect.Parameter.empty: + self.assertIn(parameter_name, tokenizer.init_kwargs) + + def test_build_inputs_with_special_tokens(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + self.skipTest(reason="test_slow_tokenizer is set to False") + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + + # Input tokens id + words, boxes = self.get_words_and_boxes() + input_simple = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False) + input_pair = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False) + + # Generate output + output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple) + output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple) + self.assertEqual(output_p, output_r) + + # Generate pair output + output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair) + output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair) + self.assertEqual(output_p, output_r) + + def test_special_tokens_mask_input_pairs(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + words, + boxes=boxes, + add_special_tokens=True, + return_special_tokens_mask=True, + # add_prefix_space=False, + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + + filtered_sequence = [ + (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) + ] + filtered_sequence = [x for x in filtered_sequence if x is not None] + self.assertEqual(encoded_sequence, filtered_sequence) + + def test_special_tokens_mask(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + # Testing single inputs + encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + words, boxes=boxes, add_special_tokens=True, return_special_tokens_mask=True + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + + filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]] + self.assertEqual(encoded_sequence, filtered_sequence) + + def test_save_and_load_tokenizer(self): + # safety check on max_len default value so we are sure the test works + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + self.assertNotEqual(tokenizer.model_max_length, 42) + + # Now let's 
start the test + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + words, boxes = self.get_words_and_boxes() + tmpdirname = tempfile.mkdtemp() + + before_tokens = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + before_vocab = tokenizer.get_vocab() + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + after_vocab = after_tokenizer.get_vocab() + self.assertListEqual(before_tokens, after_tokens) + self.assertDictEqual(before_vocab, after_vocab) + + shutil.rmtree(tmpdirname) + + @unittest.skip(reason="Not implemented") + def test_right_and_left_truncation(self): + pass + + def test_right_and_left_padding(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + sequence = "Sequence" + padding_size = 10 + + # check correct behaviour if no pad_token_id exists and add it eventually + self._check_no_pad_token_padding(tokenizer, sequence) + + padding_idx = tokenizer.pad_token_id + + # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" + encoded_sequence = tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode( + words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length" + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + + # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "left" + encoded_sequence = tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode( + words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length" + ) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert [padding_idx] * padding_size + encoded_sequence == padded_sequence + + # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding' + encoded_sequence = tokenizer.encode(words, boxes=boxes) + sequence_length = len(encoded_sequence) + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(words, boxes=boxes, padding=True) + padded_sequence_right_length = len(padded_sequence_right) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + tokenizer.padding_side = "left" + padded_sequence_left = tokenizer.encode(words, boxes=boxes, padding="longest") + padded_sequence_left_length = len(padded_sequence_left) + assert sequence_length == padded_sequence_left_length + assert encoded_sequence == padded_sequence_left + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(words, boxes=boxes) + padded_sequence_right_length = len(padded_sequence_right) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + + tokenizer.padding_side = "left" + 
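# With padding=False (also the default used for the right-padded case above), encoding should again leave the sequence untouched regardless of padding_side.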
padded_sequence_left = tokenizer.encode(words, boxes=boxes, padding=False) + padded_sequence_left_length = len(padded_sequence_left) + assert sequence_length == padded_sequence_left_length + assert encoded_sequence == padded_sequence_left + + def test_token_type_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # test 1: single sequence + words, boxes = self.get_words_and_boxes() + + output = tokenizer(words, boxes=boxes, return_token_type_ids=True) + + # Assert that the token type IDs have the same length as the input IDs + self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"])) + + # Assert that the token type IDs have the same length as the attention mask + self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"])) + + self.assertIn(0, output["token_type_ids"]) + self.assertNotIn(1, output["token_type_ids"]) + + # test 2: two sequences (question + words) + question, words, boxes = self.get_question_words_and_boxes() + + output = tokenizer(question, words, boxes, return_token_type_ids=True) + + # Assert that the token type IDs have the same length as the input IDs + self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"])) + + # Assert that the token type IDs have the same length as the attention mask + self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"])) + + self.assertIn(0, output["token_type_ids"]) + self.assertNotIn(1, output["token_type_ids"]) + + def test_offsets_mapping(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + text = ["a", "wonderful", "test"] + boxes = [[1, 8, 12, 20] for _ in range(len(text))] + + # No pair + tokens_with_offsets = tokenizer_r.encode_plus( + text, + boxes=boxes, + return_special_tokens_mask=True, + return_offsets_mapping=True, + add_special_tokens=True, + ) + added_tokens = tokenizer_r.num_special_tokens_to_add(False) + offsets = tokens_with_offsets["offset_mapping"] + + # Assert there is the same number of tokens and offsets + self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) + + # Assert there is online added_tokens special_tokens + self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) + + # Pairs + text = "what's his name" + pair = ["a", "wonderful", "test"] + boxes = [[1, 8, 12, 20] for _ in range(len(pair))] + tokens_with_offsets = tokenizer_r.encode_plus( + text, + pair, + boxes=boxes, + return_special_tokens_mask=True, + return_offsets_mapping=True, + add_special_tokens=True, + ) + added_tokens = tokenizer_r.num_special_tokens_to_add(True) + offsets = tokens_with_offsets["offset_mapping"] + + # Assert there is the same number of tokens and offsets + self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) + + # Assert there is online added_tokens special_tokens + self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) + + @require_torch + @slow + def test_torch_encode_plus_sent_to_model(self): + import torch + + from transformers import MODEL_MAPPING, TOKENIZER_MAPPING + + MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING) + + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + if 
tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: + self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING") + + config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] + config = config_class() + + if config.is_encoder_decoder or config.pad_token_id is None: + self.skipTest(reason="Model is an encoder-decoder or has no pad token id set.") + + model = model_class(config) + + # Make sure the model contains at least the full vocabulary size in its embedding matrix + is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight") + assert ( + (model.get_input_embeddings().weight.shape[0] >= len(tokenizer)) + if is_using_common_embeddings + else True + ) + + # Build sequence + words, boxes = self.get_words_and_boxes() + encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt") + batch_encoded_sequence = tokenizer.batch_encode_plus( + [words, words], [boxes, boxes], return_tensors="pt" + ) + # This should not fail + + with torch.no_grad(): # saves some time + model(**encoded_sequence) + model(**batch_encoded_sequence) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + self.skipTest(reason="test_slow_tokenizer is set to False") + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + words, boxes = self.get_words_and_boxes() + + ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=True) + rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=True) + self.assertListEqual(ids, rust_ids) + + def test_tokenization_python_rust_equals(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + self.skipTest(reason="test_slow_tokenizer is set to False") + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + + words, boxes = self.get_words_and_boxes() + + # Ensure basic input match + input_p = tokenizer_p.encode_plus(words, boxes=boxes) + input_r = tokenizer_r.encode_plus(words, boxes=boxes) + + for key in filter( + lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() + ): + self.assertSequenceEqual(input_p[key], input_r[key]) + + input_pairs_p = tokenizer_p.encode_plus(words, boxes=boxes) + input_pairs_r = tokenizer_r.encode_plus(words, boxes=boxes) + + for key in filter( + lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() + ): + self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key]) + + words = ["hello" for _ in range(1000)] + boxes = [[1000, 1000, 1000, 1000] for _ in range(1000)] + + # Ensure truncation match + input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=512, truncation=True) + input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=512, truncation=True) + + for key in filter( + lambda x: x in ["input_ids", "token_type_ids", 
"attention_mask", "bbox"], input_p.keys() + ): + self.assertSequenceEqual(input_p[key], input_r[key]) + + # Ensure truncation with stride match + input_p = tokenizer_p.encode_plus( + words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True + ) + input_r = tokenizer_r.encode_plus( + words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True + ) + + for key in filter( + lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() + ): + self.assertSequenceEqual(input_p[key], input_r[key][0]) + + def test_embeded_special_tokens(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + self.skipTest(reason="test_slow_tokenizer is set to False") + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + words, boxes = self.get_words_and_boxes() + tokens_r = tokenizer_r.encode_plus( + words, + boxes=boxes, + add_special_tokens=True, + ) + tokens_p = tokenizer_p.encode_plus( + words, + boxes=boxes, + add_special_tokens=True, + ) + + for key in tokens_p.keys(): + self.assertEqual(tokens_r[key], tokens_p[key]) + + if "token_type_ids" in tokens_r: + self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) + + tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) + tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) + self.assertSequenceEqual(tokens_r, tokens_p) + + def test_compare_add_special_tokens(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False) + + words, boxes = self.get_words_and_boxes() + # tokenize() + no_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=False) + with_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=True) + self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add) + + # encode() + no_special_tokens = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=False) + with_special_tokens = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=True) + self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add) + + # encode_plus() + no_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=False) + with_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=True) + for key in no_special_tokens.keys(): + self.assertEqual( + len(no_special_tokens[key]), + len(with_special_tokens[key]) - simple_num_special_tokens_to_add, + ) + + # # batch_encode_plus + words, boxes = self.get_words_and_boxes_batch() + + no_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=False) + with_special_tokens = tokenizer_r.batch_encode_plus(words, boxes=boxes, add_special_tokens=True) + for key in no_special_tokens.keys(): + for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]): + self.assertEqual(len(i_no), len(i_with) - 
simple_num_special_tokens_to_add) + + @slow + def test_layoutxlm_truncation_integration_test(self): + words, boxes = self.get_words_and_boxes() + + tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base", model_max_length=512) + + for i in range(12, 512): + new_encoded_inputs = tokenizer.encode(words, boxes=boxes, max_length=i, truncation=True) + + # Ensure that the input IDs are less than the max length defined. + self.assertLessEqual(len(new_encoded_inputs), i) + + tokenizer.model_max_length = 20 + new_encoded_inputs = tokenizer.encode(words, boxes=boxes, truncation=True) + dropped_encoded_inputs = tokenizer.encode(words, boxes=boxes, truncation=True) + + # Ensure that the input IDs are still truncated when no max_length is specified + self.assertListEqual(new_encoded_inputs, dropped_encoded_inputs) + self.assertLessEqual(len(new_encoded_inputs), 20) + + def test_sequence_ids(self): + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + if not tokenizer.is_fast: + continue + with self.subTest(f"{tokenizer.__class__.__name__}"): + seq_0 = "Test this method." + seq_1 = ["With", "these", "inputs."] + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(seq_1))] + + # We want to have sequence 0 and sequence 1 are tagged + # respectively with 0 and 1 token_ids + # (regardless of whether the model use token type ids) + # We use this assumption in the QA pipeline among other place + output = tokenizer(seq_0.split(), boxes=boxes) + self.assertIn(0, output.sequence_ids()) + + output = tokenizer(seq_0, seq_1, boxes=boxes) + self.assertIn(0, output.sequence_ids()) + self.assertIn(1, output.sequence_ids()) + + if tokenizer.num_special_tokens_to_add(pair=True): + self.assertIn(None, output.sequence_ids()) + + def test_special_tokens_initialization(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + added_tokens = [AddedToken("", lstrip=True)] + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + words = "Hey this is a token".split() + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + r_output = tokenizer_r.encode(words, boxes=boxes) + + special_token_id = tokenizer_r.encode( + [""], boxes=[1000, 1000, 1000, 1000], add_special_tokens=False + )[0] + + self.assertTrue(special_token_id in r_output) + + if self.test_slow_tokenizer: + tokenizer_cr = self.rust_tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True + ) + tokenizer_p = self.tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + + words = "Hey this is a token".split() + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + + p_output = tokenizer_p.encode(words, boxes=boxes) + cr_output = tokenizer_cr.encode(words, boxes=boxes) + + self.assertEqual(p_output, r_output) + self.assertEqual(cr_output, r_output) + self.assertTrue(special_token_id in p_output) + self.assertTrue(special_token_id in cr_output) + + def test_training_new_tokenizer(self): + # This feature only exists for fast tokenizers + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + tokenizer = self.get_rust_tokenizer() + new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) + + # Test we can use the new tokenizer with something not seen during training + text = 
[["this", "is", "the"], ["how", "are", "you"]] + boxes = [[[1, 2, 3, 4], [5, 6, 7, 8], [1, 3, 4, 8]], [[5, 6, 7, 8], [4, 5, 6, 7], [3, 9, 2, 7]]] + inputs = new_tokenizer(text, boxes=boxes) + self.assertEqual(len(inputs["input_ids"]), 2) + decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) + expected_result = "this is the" + + if tokenizer.backend_tokenizer.normalizer is not None: + expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result) + self.assertEqual(expected_result, decoded_input) + + # We check that the parameters of the tokenizer remained the same + # Check we have the same number of added_tokens for both pair and non-pair inputs. + self.assertEqual(tokenizer.num_special_tokens_to_add(False), new_tokenizer.num_special_tokens_to_add(False)) + self.assertEqual(tokenizer.num_special_tokens_to_add(True), new_tokenizer.num_special_tokens_to_add(True)) + + # Check we have the correct max_length for both pair and non-pair inputs. + self.assertEqual(tokenizer.max_len_single_sentence, new_tokenizer.max_len_single_sentence) + self.assertEqual(tokenizer.max_len_sentences_pair, new_tokenizer.max_len_sentences_pair) + + # Assert the set of special tokens match as we didn't ask to change them + self.assertSequenceEqual( + tokenizer.all_special_tokens_extended, + new_tokenizer.all_special_tokens_extended, + ) + + self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map) + + def test_training_new_tokenizer_with_special_tokens_change(self): + # This feature only exists for fast tokenizers + if not self.test_rust_tokenizer: + self.skipTest(reason="test_rust_tokenizer is set to False") + + tokenizer = self.get_rust_tokenizer() + # Test with a special tokens map + class_signature = inspect.signature(tokenizer.__class__) + if "cls_token" in class_signature.parameters: + new_tokenizer = tokenizer.train_new_from_iterator( + SMALL_TRAINING_CORPUS, 100, special_tokens_map={tokenizer.cls_token: ""} + ) + cls_id = new_tokenizer.get_vocab()[""] + self.assertEqual(new_tokenizer.cls_token, "") + self.assertEqual(new_tokenizer.cls_token_id, cls_id) + + # Create a new mapping from the special tokens defined in the original tokenizer + special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy() + special_tokens_list.remove("additional_special_tokens") + special_tokens_map = {} + for token in special_tokens_list: + # Get the private one to avoid unnecessary warnings. + if getattr(tokenizer, token) is not None: + special_token = getattr(tokenizer, token) + special_tokens_map[special_token] = f"{special_token}a" + + # Train new tokenizer + new_tokenizer = tokenizer.train_new_from_iterator( + SMALL_TRAINING_CORPUS, 100, special_tokens_map=special_tokens_map + ) + + # Check the changes + for token in special_tokens_list: + # Get the private one to avoid unnecessary warnings. 
+ if getattr(tokenizer, token) is None: + continue + special_token = getattr(tokenizer, token) + if special_token in special_tokens_map: + new_special_token = getattr(new_tokenizer, token) + self.assertEqual(special_tokens_map[special_token], new_special_token) + + new_id = new_tokenizer.get_vocab()[new_special_token] + self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id) + + # Check if the AddedToken / string format has been kept + for special_token in tokenizer.all_special_tokens_extended: + if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map: + # The special token must appear identically in the list of the new tokenizer. + self.assertTrue( + special_token in new_tokenizer.all_special_tokens_extended, + f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}", + ) + elif isinstance(special_token, AddedToken): + # The special token must appear in the list of the new tokenizer as an object of type AddedToken with + # the same parameters as the old AddedToken except the content that the user has requested to change. + special_token_str = special_token.content + new_special_token_str = special_tokens_map[special_token_str] + + find = False + for candidate in new_tokenizer.all_special_tokens_extended: + if ( + isinstance(candidate, AddedToken) + and candidate.content == new_special_token_str + and candidate.lstrip == special_token.lstrip + and candidate.rstrip == special_token.rstrip + and candidate.normalized == special_token.normalized + and candidate.single_word == special_token.single_word + ): + find = True + break + self.assertTrue( + find, + f"'{new_special_token_str}' doesn't appear in the list " + f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as " + f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}", + ) + elif special_token not in special_tokens_map: + # The special token must appear identically in the list of the new tokenizer. + self.assertTrue( + special_token in new_tokenizer.all_special_tokens_extended, + f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}", + ) + + else: + # The special token must appear in the list of the new tokenizer as an object of type string. 
+ self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended) + + # Test we can use the new tokenizer with something not seen during training + words = [["this", "is"], ["hello", "🤗"]] + boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[1, 2, 3, 4], [5, 6, 7, 8]]] + inputs = new_tokenizer(words, boxes=boxes) + self.assertEqual(len(inputs["input_ids"]), 2) + decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) + expected_result = "this is" + + if tokenizer.backend_tokenizer.normalizer is not None: + expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result) + self.assertEqual(expected_result, decoded_input) + + def test_prepare_for_model(self): + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + # only test prepare_for_model for the slow tokenizer + if tokenizer.__class__.__name__ == "LayoutXLMTokenizerFast": + continue + with self.subTest(f"{tokenizer.__class__.__name__}"): + words, boxes = self.get_words_and_boxes() + prepared_input_dict = tokenizer.prepare_for_model(words, boxes=boxes, add_special_tokens=True) + + input_dict = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True) + + self.assertEqual(input_dict, prepared_input_dict) + + def test_padding_different_model_input_name(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + self.skipTest(reason="test_slow_tokenizer is set to False") + + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) + pad_token_id = tokenizer_p.pad_token_id + + words, boxes = self.get_words_and_boxes_batch() + + input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes) + input_p = tokenizer_r.batch_encode_plus(words, boxes=boxes) + + # rename encoded batch to "inputs" + input_r["inputs"] = input_r[tokenizer_r.model_input_names[0]] + del input_r[tokenizer_r.model_input_names[0]] + + input_p["inputs"] = input_p[tokenizer_p.model_input_names[0]] + del input_p[tokenizer_p.model_input_names[0]] + + # Renaming `input_ids` to `inputs` + tokenizer_r.model_input_names = ["inputs"] + tokenizer_r.model_input_names[1:] + tokenizer_p.model_input_names = ["inputs"] + tokenizer_p.model_input_names[1:] + + input_r = tokenizer_r.pad(input_r, padding="longest") + input_p = tokenizer_r.pad(input_p, padding="longest") + + max_length = len(input_p["inputs"][0]) + self.assert_batch_padded_input_match( + input_r, input_p, max_length, pad_token_id, model_main_input_name="inputs" + ) + + def test_batch_encode_dynamic_overflowing(self): + """ + When calling batch_encode with multiple sequences, it can return different number of + overflowing encoding for each sequence: + [ + Sequence 1: [Encoding 1, Encoding 2], + Sequence 2: [Encoding 1], + Sequence 3: [Encoding 1, Encoding 2, ... 
Encoding N] + ] + This needs to be padded so that it can represented as a tensor + """ + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"): + if is_torch_available(): + returned_tensor = "pt" + elif is_tf_available(): + returned_tensor = "tf" + else: + returned_tensor = "jax" + + # Single example + words, boxes = self.get_words_and_boxes() + tokens = tokenizer.encode_plus( + words, + boxes=boxes, + max_length=6, + padding=True, + truncation=True, + return_tensors=returned_tensor, + return_overflowing_tokens=True, + ) + + for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): + if key != "bbox": + self.assertEqual(len(tokens[key].shape), 2) + else: + self.assertEqual(len(tokens[key].shape), 3) + + # Batch of examples + # For these 2 examples, 3 training examples will be created + words, boxes = self.get_words_and_boxes_batch() + tokens = tokenizer.batch_encode_plus( + words, + boxes=boxes, + max_length=6, + padding=True, + truncation="only_first", + return_tensors=returned_tensor, + return_overflowing_tokens=True, + ) + + for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): + if key != "bbox": + self.assertEqual(len(tokens[key].shape), 2) + self.assertEqual(tokens[key].shape[-1], 6) + else: + self.assertEqual(len(tokens[key].shape), 3) + self.assertEqual(tokens[key].shape[-1], 4) + + # overwrite from test_tokenization_common to speed up test + def test_save_pretrained(self): + if not self.test_slow_tokenizer: + # as we don't have a slow version, we can't compare the outputs between slow and fast versions + self.skipTest(reason="test_slow_tokenizer is set to False") + + self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-layoutxlm", {}) + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it save with the same files + the tokenizer.json file for the fast one + self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) + tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f) + self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key)) + # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id")) + + shutil.rmtree(tmpdirname2) + + # Save tokenizer rust, legacy_format=True + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it save with the same files + 
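# With legacy_format=True the fast tokenizer writes the same slow-format vocabulary files as the Python tokenizer, so the two file lists should match exactly.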
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + + shutil.rmtree(tmpdirname2) + + # Save tokenizer rust, legacy_format=False + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it saved the tokenizer.json file + self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + + shutil.rmtree(tmpdirname2) + + @unittest.skip(reason="TO DO: overwrite this very extensive test.") + def test_alignement_methods(self): + pass + + @unittest.skip(reason="layoutxlm tokenizer requires boxes besides sequences.") + def test_maximum_encoding_length_pair_input(self): + pass + + @unittest.skip(reason="layoutxlm tokenizer requires boxes besides sequences.") + def test_maximum_encoding_length_single_input(self): + pass + + @unittest.skip(reason="layoutxlm tokenizer requires boxes besides sequences.") + def test_pretokenized_inputs(self): + pass + + @unittest.skip(reason="layoutxlm tokenizer always expects pretokenized inputs.") + def test_compare_pretokenized_inputs(self): + pass + + @unittest.skip(reason="layoutxlm fast tokenizer does not support prepare_for_model") + def test_compare_prepare_for_model(self): + pass + + @slow + def test_only_label_first_subword(self): + words = ["hello", "niels"] + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] + word_labels = [0, 1] + + # test slow tokenizer + tokenizer_p = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base") + encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels) + self.assertListEqual(encoding.labels, [-100, 0, -100, 1, -100, -100]) + + tokenizer_p = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base", only_label_first_subword=False) + encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels) + self.assertListEqual(encoding.labels, [-100, 0, 0, 1, 1, -100]) + + # test fast tokenizer + tokenizer_r = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base") + encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels) + self.assertListEqual(encoding.labels, [-100, 0, -100, 1, -100, -100]) + + tokenizer_r = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base", only_label_first_subword=False) + encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels) + self.assertListEqual(encoding.labels, [-100, 0, 0, 1, 1, -100]) + + @slow + def test_layoutxlm_integration_test(self): + tokenizer_p = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base") + tokenizer_r = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base") + + # There are 3 cases: + # CASE 1: document image classification (training + inference), document image token classification (inference), + # in which case only words and normalized bounding boxes are provided to the 
tokenizer + # CASE 2: document image token classification (training), + # in which case one also provides word labels to the tokenizer + # CASE 3: document image visual question answering (inference), + # in which case one also provides a question to the tokenizer + + # We need to test all 3 cases both on batched and non-batched inputs. + + # CASE 1: not batched + words, boxes = self.get_words_and_boxes() + + expected_results = {'input_ids': [0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} # fmt: skip + + encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20) + encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # CASE 1: batched + words, boxes = self.get_words_and_boxes_batch() + + expected_results = {'input_ids': [[0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 33600, 31, 759, 9351, 83, 21895, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip + + encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20) + encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # CASE 2: not batched + words, boxes = self.get_words_and_boxes() + word_labels = [1, 2, 3] + + expected_results = {'input_ids': [0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'labels': [-100, 1, 2, -100, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} # fmt: skip + + encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) + encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, 
padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # CASE 2: batched + words, boxes = self.get_words_and_boxes_batch() + word_labels = [[1, 2, 3], [2, 46, 17, 22, 3]] + + expected_results = {'input_ids': [[0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 33600, 31, 759, 9351, 83, 21895, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'labels': [[-100, 1, 2, -100, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], [-100, 2, -100, 46, 17, 22, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip + + encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) + encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # CASE 3: not batched + question, words, boxes = self.get_question_words_and_boxes() + + expected_results = {'input_ids': [0, 2367, 25, 7, 1919, 9351, 32, 2, 2, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], 'bbox': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]} # fmt: skip + + encoding_p = tokenizer_p(question, words, boxes, padding="max_length", max_length=20) + encoding_r = tokenizer_r(question, words, boxes, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + # CASE 3: batched + questions, words, boxes = self.get_question_words_and_boxes_batch() + + expected_results = {'input_ids': [[0, 2367, 25, 7, 1919, 9351, 32, 2, 2, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1], [0, 3642, 83, 764, 35839, 32, 2, 2, 2367, 10, 21, 3190, 53496, 19, 2, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]], 'bbox': [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 
0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [336, 42, 353, 57], [34, 42, 66, 69], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]} # fmt: skip + encoding_p = tokenizer_p(questions, words, boxes, padding="max_length", max_length=20) + encoding_r = tokenizer_r(questions, words, boxes, padding="max_length", max_length=20) + self.assertDictEqual(dict(encoding_p), expected_results) + self.assertDictEqual(dict(encoding_r), expected_results) + + @unittest.skip(reason="Doesn't support another framework than PyTorch") + def test_np_encode_plus_sent_to_model(self): + pass + + @unittest.skip(reason="Doesn't use SentencePiece") + def test_sentencepiece_tokenize_and_convert_tokens_to_string(self): + pass + + @unittest.skip(reason="Doesn't use SentencePiece") + def test_sentencepiece_tokenize_and_decode(self): + pass + + @unittest.skip(reason="Chat is not supported") + def test_chat_template(self): + pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask(self): + pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask_truncated(self): + pass diff --git a/docs/transformers/tests/models/led/__init__.py b/docs/transformers/tests/models/led/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/led/test_modeling_led.py b/docs/transformers/tests/models/led/test_modeling_led.py new file mode 100644 index 0000000000000000000000000000000000000000..a80543ef258b4510a51ddfb44c57e2ab02941cf6 --- /dev/null +++ b/docs/transformers/tests/models/led/test_modeling_led.py @@ -0,0 +1,607 @@ +# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
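The LED tests that follow exercise Longformer-style global attention through a `global_attention_mask`. A minimal sketch of that convention, using a deliberately tiny, made-up configuration (none of these values come from the diff) so it runs quickly on CPU:

```python
import torch
from transformers import LEDConfig, LEDModel

# Tiny illustrative config; real LED checkpoints are far larger.
config = LEDConfig(
    vocab_size=99,
    d_model=16,
    encoder_layers=2,
    decoder_layers=2,
    encoder_attention_heads=4,
    decoder_attention_heads=4,
    encoder_ffn_dim=4,
    decoder_ffn_dim=4,
    max_encoder_position_embeddings=64,
    max_decoder_position_embeddings=64,
    attention_window=4,
)
model = LEDModel(config).eval()

input_ids = torch.randint(3, config.vocab_size, (2, 11))
decoder_input_ids = torch.randint(3, config.vocab_size, (2, 11))

# A value of 1 marks tokens that attend globally (and are attended to by every position);
# the tests below flag the last token of each sequence in the same way.
global_attention_mask = torch.zeros_like(input_ids)
global_attention_mask[:, -1] = 1

with torch.no_grad():
    outputs = model(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),
        decoder_input_ids=decoder_input_ids,
        global_attention_mask=global_attention_mask,
    )

print(outputs.last_hidden_state.shape)  # decoder output: (batch=2, seq=11, d_model=16)
```

The tester below builds exactly this kind of mask in `prepare_config_and_inputs_for_common`.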
+"""Testing suite for the PyTorch LED model.""" + +import copy +import tempfile +import unittest + +from transformers import LEDConfig, is_torch_available +from transformers.models.auto import get_values +from transformers.testing_utils import ( + require_sentencepiece, + require_tokenizers, + require_torch, + require_torch_fp16, + slow, + torch_device, +) +from transformers.utils import cached_property + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_QUESTION_ANSWERING_MAPPING, + LEDForConditionalGeneration, + LEDForQuestionAnswering, + LEDForSequenceClassification, + LEDModel, + LEDTokenizer, + ) + from transformers.models.led.modeling_led import LEDDecoder, LEDEncoder + + +def prepare_led_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + if head_mask is None: + head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device) + if decoder_head_mask is None: + decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + if cross_attn_head_mask is None: + cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + } + + +class LEDModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=11, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=32, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + attention_window=4, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.attention_window = attention_window + + # `ModelTesterMixin.test_attention_outputs` is expecting attention tensors to be of size + # [num_attention_heads, encoder_seq_length, encoder_key_length], but LongformerSelfAttention + # returns attention of shape [num_attention_heads, encoder_seq_length, self.attention_window + 1] + # because its local attention only attends to `self.attention_window 
+ 1` locations + # (assuming no token with global attention, otherwise the last dimension of attentions + # is x + self.attention_window + 1, where x is the number of tokens with global attention) + # x is set to 1 + self.encoder_key_length = self.attention_window + 2 + + # because of padding, `encoder_seq_length` is different from `seq_length`. Relevant for + # the `test_attention_outputs` and `test_hidden_states_output` tests + self.encoder_seq_length = self.seq_length + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( + 3, + ) + input_ids[:, -1] = self.eos_token_id # Eos Token + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = self.get_config() + inputs_dict = prepare_led_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def get_config(self): + return LEDConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + attention_window=self.attention_window, + ) + + def get_pipeline_config(self): + config = self.get_config() + config.max_position_embeddings = 100 + config.vocab_size = 300 + return config + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + global_attention_mask = torch.zeros_like(inputs_dict["input_ids"]) + global_attention_mask[:, -1] = 1 + inputs_dict["global_attention_mask"] = global_attention_mask + + return config, inputs_dict + + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = LEDModel(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] + head_mask = inputs_dict["head_mask"] + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical next tokens and extend them to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and next_attention_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + 
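# Editor's note (comment added, not part of the original test): with `use_cache=True` the first forward pass returns `past_key_values`; feeding only the 3 new tokens together with that cache must reproduce, on the same random feature slice over the last 3 positions, the hidden states obtained by re-running the full concatenated sequence, up to numerical noise (atol=1e-2). + 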
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = LEDModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = LEDEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder( + inputs_dict["input_ids"], + attention_mask=inputs_dict["attention_mask"], + global_attention_mask=inputs_dict["global_attention_mask"], + )[0] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = LEDDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + def check_global_attention(self, config, inputs_dict): + model = LEDModel(config=config).to(torch_device).eval() + model.config.output_attentions = True + attention_mask = ids_tensor(inputs_dict["input_ids"].shape, vocab_size=2) + global_attention_mask = torch.zeros_like(attention_mask) + + # set some tokens to global_attention + num_tokens_with_global_attention = 2 + + attention_mask[:, 2 : 2 + num_tokens_with_global_attention] = 1 + global_attention_mask[:, 2 : 2 + num_tokens_with_global_attention] = 1 + inputs_dict["attention_mask"] = attention_mask + inputs_dict["global_attention_mask"] = global_attention_mask + + outputs = model(**inputs_dict) + self.parent.assertIsNotNone(outputs.encoder_global_attentions) + + # setting `num_tokens_with_global_attention` to global_attentions yields + # makes last dim to be of `num_tokens_with_global_attention` + self.parent.assertTrue( + outputs.encoder_global_attentions[0].shape, + (self.batch_size, self.num_attention_heads, self.encoder_seq_length, num_tokens_with_global_attention), + ) + + +@require_torch +class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + (LEDModel, LEDForConditionalGeneration, LEDForSequenceClassification, LEDForQuestionAnswering) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": LEDModel, + "question-answering": LEDForQuestionAnswering, + "summarization": LEDForConditionalGeneration, + "text-classification": LEDForSequenceClassification, + "text2text-generation": LEDForConditionalGeneration, + "translation": LEDForConditionalGeneration, + "zero-shot": LEDForSequenceClassification, + } + if is_torch_available() + else {} + ) + is_encoder_decoder = True + test_pruning = False + test_missing_keys = False + test_torchscript = False + + # TODO: Fix the failed tests when this model gets more usage + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + if pipeline_test_case_name == 
"QAPipelineTests" and not tokenizer_name.endswith("Fast"): + return True + + return False + + def setUp(self): + self.model_tester = LEDModelTester(self) + self.config_tester = ConfigTester(self, config_class=LEDConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + def test_global_attention(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_global_attention(*config_and_inputs) + + def prepare_config_and_inputs_for_generate(self, *args, **kwargs): + config, inputs_dict = super().prepare_config_and_inputs_for_generate(*args, **kwargs) + # LED computes attention scores based on mask indices if `is_global` + inputs_dict.pop("global_attention_mask") + return config, inputs_dict + + # LEDForSequenceClassification does not support inputs_embeds + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in (LEDModel, LEDForConditionalGeneration, LEDForQuestionAnswering): + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with torch.no_grad(): + model(**inputs)[0] + + @require_torch_fp16 + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = LEDForConditionalGeneration(config).eval().to(torch_device) + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + @unittest.skip(reason="Longformer cannot keep gradients in attentions or hidden states") + def test_retain_grad_hidden_states_attentions(self): + return + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + seq_length = self.model_tester.seq_length + encoder_seq_length = self.model_tester.encoder_seq_length + encoder_key_length = self.model_tester.encoder_key_length + + for model_class in self.all_model_classes: + 
inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + # global attention outputs are added as well => so +1 here + correct_outlen = 6 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Question Answering model returns start_logits and end_logits + if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): + correct_outlen += 1 # start_logits and end_logits instead of only 1 output + if "past_key_values" in outputs: + correct_outlen += 1 # past_key_values have been returned + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions + self.assertIsInstance(decoder_attentions, (list, tuple)) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length, seq_length], + ) + + # cross attentions + cross_attentions = outputs.cross_attentions + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + seq_length, + seq_length, + ], + ) + + def _check_encoder_attention_for_generate(self, attentions, batch_size, config, prompt_length): + # overwrite because LED does not have (bs, num_heads, seq_len, seq_len) shape + encoder_expected_shape = ( + batch_size, + config.num_attention_heads, + prompt_length, + self.model_tester.attention_window // 2 * 2 + 1, + ) + self.assertIsInstance(attentions, tuple) + self.assertListEqual( + [layer_attentions.shape for layer_attentions in attentions], + [encoder_expected_shape] * len(attentions), + ) + + +def assert_tensors_close(a, b, atol=1e-12, prefix=""): + """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item() + if a.numel() > 100: + msg = f"tensor values are {pct_different:.1%} percent different." 
+ else: + msg = f"{a} != {b}" + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + + +def _long_tensor(tok_lst): + return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) + + +TOLERANCE = 1e-4 + + +@require_torch +@require_sentencepiece +@require_tokenizers +@slow +class LEDModelIntegrationTests(unittest.TestCase): + """All the below results were obtained with the original checkpoints and code + base from https://github.com/allenai/longformer. + IMPORTANT: Note that the original checkpoints include a `postion_embeddings` "hack" + and have to be cut to have the correct shape. + See: https://github.com/huggingface/transformers/pull/9278#issue-544709661. + """ + + @cached_property + def default_tokenizer(self): + return LEDTokenizer.from_pretrained("allenai/led-base-16384") + + def test_inference_no_head(self): + model = LEDModel.from_pretrained("allenai/led-base-16384").to(torch_device) + + # change to intended input + input_ids = _long_tensor([512 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + decoder_input_ids = _long_tensor([128 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + inputs_dict = prepare_led_inputs_dict(model.config, input_ids, decoder_input_ids) + with torch.no_grad(): + output = model(**inputs_dict).last_hidden_state + expected_shape = torch.Size((1, 1024, 768)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[2.3050, 2.8279, 0.6531], [-1.8457, -0.1455, -3.5661], [-1.0186, 0.4586, -2.2043]], device=torch_device + ) + torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=TOLERANCE, atol=TOLERANCE) + + def test_inference_head(self): + model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384").to(torch_device) + + # change to intended input + input_ids = _long_tensor([512 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + decoder_input_ids = _long_tensor([128 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + inputs_dict = prepare_led_inputs_dict(model.config, input_ids, decoder_input_ids) + with torch.no_grad(): + output = model(**inputs_dict, use_cache=False).logits + expected_shape = torch.Size((1, 1024, model.config.vocab_size)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[33.6507, 6.4572, 16.8089], [5.8739, -2.4238, 11.2902], [-3.2139, -4.3149, 4.2783]], device=torch_device + ) + torch.testing.assert_close(output[:, :3, :3], expected_slice, rtol=TOLERANCE, atol=TOLERANCE) + + def test_seq_to_seq_generation(self): + # this test requires 16GB of RAM + hf = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv").to(torch_device) + tok = LEDTokenizer.from_pretrained("allenai/led-large-16384-arxiv") + + ARTICLE_LEP = r"""the lep experiments at the resonance of @xmath1-boson have tested the standard model ( sm ) at quantum level , measuring the @xmath1-decay into fermion pairs with an accuracy of one part in ten thousands . the good agreement of the lep data with the sm predictions have severely constrained the behavior of new physics at the @xmath1-pole . taking these achievements into account one can imagine that the physics of @xmath1-boson will again play the central role in the frontier of particle physics if the next generation @xmath1 factory comes true with the generated @xmath1 events several orders of magnitude higher than that of the lep . this factory can be realized in the gigaz option of the international linear collider ( ilc)@xcite . 
the ilc is a proposed electron - positron collider with tunable energy ranging from @xmath12 to @xmath13 and polarized beams in its first phase , and the gigaz option corresponds to its operation on top of the resonance of @xmath1 boson by adding a bypass to its main beam line . given the high luminosity , @xmath14 , and the cross section at the resonance of @xmath1 boson , @xmath15 , about @xmath16 @xmath1 events can be generated in an operational year of @xmath17 of gigaz , which implies that the expected sensitivity to the branching ratio of @xmath1-decay can be improved from @xmath18 at the lep to @xmath19 at the gigaz@xcite . in light of this , the @xmath1-boson properties , especially its exotic or rare decays which are widely believed to be sensitive to new physics , should be investigated comprehensively to evaluate their potential in probing new physics . among the rare @xmath1-decays , the flavor changing ( fc ) processes were most extensively studied to explore the flavor texture in new physics @xcite , and it was found that , although these processes are severely suppressed in the sm , their branching ratios in new physics models can be greatly enhanced to @xmath19 for lepton flavor violation decays @xcite and @xmath20 for quark flavor violation decays @xcite . besides the fc processes , the @xmath1-decay into light higgs boson(s ) is another type of rare process that was widely studied , e.g. the decay @xmath21 ( @xmath22 ) with the particle @xmath0 denoting a light higgs boson was studied in @xcite , the decay @xmath23 was studied in the two higgs doublet model ( 2hdm)@xcite and the minimal supersymmetric standard model ( mssm)@xcite , and the decay @xmath4 was studied in a model independent way @xcite , in 2hdm@xcite and also in mssm@xcite . these studies indicate that , in contrast with the kinematic forbidden of these decays in the sm , the rates of these decays can be as large as @xmath18 in new physics models , which lie within the expected sensitivity of the gigaz . in this work , we extend the previous studies of these decays to some new models and investigate these decays altogether . we are motivated by some recent studies on the singlet extension of the mssm , such as the next - to - minimal supersymmetric standard model ( nmssm ) @xcite and the nearly minimal supersymmetric standard model ( nmssm ) @xcite , where a light cp - odd higgs boson @xmath0 with singlet - dominant component may naturally arise from the spontaneous breaking of some approximate global symmetry like @xmath24 or peccei - quuin symmetry @xcite . these non - minimal supersymmetric models can not only avoid the @xmath25-problem , but also alleviate the little hierarchy by having such a light higgs boson @xmath0 @xcite . we are also motivated by that , with the latest experiments , the properties of the light higgs boson are more stringently constrained than before . so it is worth updating the previous studies . so far there is no model - independent lower bound on the lightest higgs boson mass . in the sm , it must be heavier than @xmath26 gev , obtained from the null observation of the higgs boson at lep experiments . however , due to the more complex structure of the higgs sector in the extensions of the sm , this lower bound can be significantly relaxed according to recent studies , e.g. , for the cp - odd higgs boson @xmath0 we have @xmath27 gev in the nmssm @xcite , @xmath28 gev in the nmssm @xcite , and @xmath29 gev in the lepton - specific 2hdm ( l2hdm ) @xcite . 
with such a light cp - odd higgs boson , the z - decay into one or more @xmath0 is open up . noting that the decay @xmath30 is forbidden due to bose symmetry , we in this work study the rare @xmath1-decays @xmath6 ( @xmath22 ) , @xmath31 and @xmath4 in a comparative way for four models , namely the type - ii 2hdm@xcite , the l2hdm @xcite , the nmssm and the nmssm . in our study , we examine carefully the constraints on the light @xmath0 from many latest experimental results . this work is organized as follows . in sec . ii we briefly describe the four new physics models . in sec . iii we present the calculations of the rare @xmath1-decays . in sec . iv we list the constraints on the four new physics models . in sec . v we show the numerical results for the branching ratios of the rare @xmath1-decays in various models . finally , the conclusion is given in sec . as the most economical way , the sm utilizes one higgs doublet to break the electroweak symmetry . as a result , the sm predicts only one physical higgs boson with its properties totally determined by two free parameters . in new physics models , the higgs sector is usually extended by adding higgs doublets and/or singlets , and consequently , more physical higgs bosons are predicted along with more free parameters involved in . the general 2hdm contains two @xmath32 doublet higgs fields @xmath33 and @xmath34 , and with the assumption of cp - conserving , its scalar potential can be parameterized as@xcite : @xmath35,\end{aligned}\ ] ] where @xmath36 ( @xmath37 ) are free dimensionless parameters , and @xmath38 ( @xmath39 ) are the parameters with mass dimension . after the electroweak symmetry breaking , the spectrum of this higgs sector includes three massless goldstone modes , which become the longitudinal modes of @xmath40 and @xmath1 bosons , and five massive physical states : two cp - even higgs bosons @xmath41 and @xmath42 , one neutral cp - odd higgs particle @xmath0 and a pair of charged higgs bosons @xmath43 . noting the constraint @xmath44 with @xmath45 and @xmath46 denoting the vacuum expectation values ( vev ) of @xmath33 and @xmath34 respectively , we choose @xmath47 as the input parameters with @xmath48 , and @xmath49 being the mixing angle that diagonalizes the mass matrix of the cp - even higgs fields . the difference between the type - ii 2hdm and the l2hdm comes from the yukawa coupling of the higgs bosons to quark / lepton . in the type - ii 2hdm , one higgs doublet @xmath34 generates the masses of up - type quarks and the other doublet @xmath33 generates the masses of down - type quarks and charged leptons ; while in the l2hdm one higgs doublet @xmath33 couples only to leptons and the other doublet @xmath34 couples only to quarks . so the yukawa interactions of @xmath0 to fermions in these two models are given by @xcite @xmath50 with @xmath51 denoting generation index . obviously , in the type - ii 2hdm the @xmath52 coupling and the @xmath53 coupling can be simultaneously enhanced by @xmath54 , while in the l2hdm only the @xmath53 coupling is enhanced by @xmath55 . 
the structures of the nmssm and the nmssm are described by their superpotentials and corresponding soft - breaking terms , which are given by @xcite @xmath56 where @xmath57 is the superpotential of the mssm without the @xmath25 term , @xmath58 and @xmath59 are higgs doublet and singlet superfields with @xmath60 and @xmath61 being their scalar component respectively , @xmath62 , @xmath63 , @xmath64 , @xmath65 , @xmath66 and @xmath67 are soft breaking parameters , and @xmath68 and @xmath69 are coefficients of the higgs self interactions . with the superpotentials and the soft - breaking terms , one can get the higgs potentials of the nmssm and the nmssm respectively . like the 2hdm , the higgs bosons with same cp property will mix and the mass eigenstates are obtained by diagonalizing the corresponding mass matrices : @xmath70 where the fields on the right hands of the equations are component fields of @xmath71 , @xmath72 and @xmath61 defined by @xmath73 @xmath74 and @xmath75 are respectively the cp - even and cp - odd neutral higgs bosons , @xmath76 and @xmath77 are goldstone bosons eaten by @xmath1 and @xmath78 , and @xmath79 is the charged higgs boson . so both the nmssm and nmssm predict three cp - even higgs bosons , two cp - odd higgs bosons and one pair of charged higgs bosons . in general , the lighter cp - odd higgs @xmath0 in these model is the mixture of the singlet field @xmath80 and the doublet field combination , @xmath81 , i.e. @xmath82 and its couplings to down - type quarks are then proportional to @xmath83 . so for singlet dominated @xmath0 , @xmath84 is small and the couplings are suppressed . as a comparison , the interactions of @xmath0 with the squarks are given by@xcite @xmath85 i.e. the interaction does not vanish when @xmath86 approaches zero . just like the 2hdm where we use the vevs of the higgs fields as fundamental parameters , we choose @xmath68 , @xmath69 , @xmath87 , @xmath88 , @xmath66 and @xmath89 as input parameters for the nmssm@xcite and @xmath68 , @xmath54 , @xmath88 , @xmath65 , @xmath90 and @xmath91 as input parameters for the nmssm@xcite . about the nmssm and the nmssm , three points should be noted . the first is for the two models , there is no explicit @xmath92term , and the effective @xmath25 parameter ( @xmath93 ) is generated when the scalar component of @xmath59 develops a vev . the second is , the nmssm is actually same as the nmssm with @xmath94@xcite , because the tadpole terms @xmath95 and its soft breaking term @xmath96 in the nmssm do not induce any interactions , except for the tree - level higgs boson masses and the minimization conditions . and the last is despite of the similarities , the nmssm has its own peculiarity , which comes from its neutralino sector . in the basis @xmath97 , its neutralino mass matrix is given by @xcite @xmath98 where @xmath99 and @xmath100 are @xmath101 and @xmath102 gaugino masses respectively , @xmath103 , @xmath104 , @xmath105 and @xmath106 . after diagonalizing this matrix one can get the mass eigenstate of the lightest neutralino @xmath107 with mass taking the following form @xcite @xmath108 this expression implies that @xmath107 must be lighter than about @xmath109 gev for @xmath110 ( from lower bound on chargnio mass ) and @xmath111 ( perturbativity bound ) . 
like the other supersymmetric models , @xmath107 as the lightest sparticle acts as the dark matter in the universe , but due to its singlino - dominated nature , it is difficult to annihilate sufficiently to get the correct density in the current universe . so the relic density of @xmath107 plays a crucial way in selecting the model parameters . for example , as shown in @xcite , for @xmath112 , there is no way to get the correct relic density , and for the other cases , @xmath107 mainly annihilates by exchanging @xmath1 boson for @xmath113 , or by exchanging a light cp - odd higgs boson @xmath0 with mass satisfying the relation @xmath114 for @xmath115 . for the annihilation , @xmath54 and @xmath25 are required to be less than 10 and @xmath116 respectively because through eq.([mass - exp ] ) a large @xmath87 or @xmath25 will suppress @xmath117 to make the annihilation more difficult . the properties of the lightest cp - odd higgs boson @xmath0 , such as its mass and couplings , are also limited tightly since @xmath0 plays an important role in @xmath107 annihilation . the phenomenology of the nmssm is also rather special , and this was discussed in detail in @xcite . in the type - ii 2hdm , l2hdm , nmssm and nmssm , the rare @xmath1-decays @xmath118 ( @xmath22 ) , @xmath3 and @xmath4 may proceed by the feynman diagrams shown in fig.[fig1 ] , fig.[fig2 ] and fig.[fig3 ] respectively . for these diagrams , the intermediate state @xmath119 represents all possible cp - even higgs bosons in the corresponding model , i.e. @xmath41 and @xmath42 in type - ii 2hdm and l2hdm and @xmath41 , @xmath42 and @xmath120 in nmssm and nmssm . in order to take into account the possible resonance effects of @xmath119 in fig.[fig1](c ) for @xmath2 and fig.[fig3 ] ( a ) for @xmath11 , we have calculated all the decay modes of @xmath119 and properly included the width effect in its propagator . as to the decay @xmath121 , two points should be noted . one is , unlike the decays @xmath6 and @xmath11 , this process proceeds only through loops mediated by quarks / leptons in the type - ii 2hdm and l2hdm , and additionally by sparticles in the nmssm and nmssm . so in most cases its rate should be much smaller than the other two . the other is due to cp - invariance , loops mediated by squarks / sleptons give no contribution to the decay@xcite . in actual calculation , this is reflected by the fact that the coupling coefficient of @xmath122 differs from that of @xmath123 by a minus sign ( see eq.([asqsq ] ) ) , and as a result , the squark - mediated contributions to @xmath121 are completely canceled out . with regard to the rare decay @xmath11 , we have more explanations . in the lowest order , this decay proceeds by the diagram shown in fig.[fig3 ] ( a ) , and hence one may think that , as a rough estimate , it is enough to only consider the contributions from fig.[fig3](a ) . however , we note that in some cases of the type - ii 2hdm and l2hdm , due to the cancelation of the contributions from different @xmath119 in fig.[fig3 ] ( a ) and also due to the potentially largeness of @xmath124 couplings ( i.e. larger than the electroweak scale @xmath125 ) , the radiative correction from the higgs - mediated loops may dominate over the tree level contribution even when the tree level prediction of the rate , @xmath126 , exceeds @xmath20 . on the other hand , we find the contribution from quark / lepton - mediated loops can be safely neglected if @xmath127 in the type - ii 2hdm and the l2hdm . 
in the nmssm and the nmssm , besides the corrections from the higgs- and quark / lepton - mediated loops , loops involving sparticles such as squarks , charginos and neutralinos can also contribute to the decay . we numerically checked that the contributions from squarks and charginos can be safely neglected if @xmath127 . we also calculated part of potentially large neutralino correction ( note that there are totally about @xmath128 diagrams for such correction ! ) and found they can be neglected too . since considering all the radiative corrections will make our numerical calculation rather slow , we only include the most important correction , namely that from higgs - mediated loops , in presenting our results for the four models . one can intuitively understand the relative smallness of the sparticle contribution to @xmath11 as follows . first consider the squark contribution which is induced by the @xmath129 interaction ( @xmath130 denotes the squark in chirality state ) and the @xmath131 interaction through box diagrams . because the @xmath132 interaction conserves the chirality of the squarks while the @xmath133 interaction violates the chirality , to get non - zero contribution to @xmath11 from the squark loops , at least four chiral flippings are needed , with three of them provided by @xmath131 interaction and the rest provided by the left - right squark mixing . this means that , if one calculates the amplitude in the chirality basis with the mass insertion method , the amplitude is suppressed by the mixing factor @xmath134 with @xmath135 being the off diagonal element in squark mass matrix . next consider the chargino / neutralino contributions . since for a light @xmath0 , its doublet component , parameterized by @xmath84 in eq.([mixing ] ) , is usually small , the couplings of @xmath0 with the sparticles will never be tremendously large@xcite . so the chargino / neutralino contributions are not important too . in our calculation of the decays , we work in the mass eigenstates of sparticles instead of in the chirality basis . for the type - ii 2hdm and the l2hdm , we consider the following constraints @xcite : * theoretical constraints on @xmath136 from perturbativity , unitarity and requirements that the scalar potential is finit at large field values and contains no flat directions @xcite , which imply that @xmath137 * the constraints from the lep search for neutral higgs bosons . we compute the signals from the higgs - strahlung production @xmath138 ( @xmath139 ) with @xmath140 @xcite and from the associated production @xmath141 with @xmath142 @xcite , and compare them with the corresponding lep data which have been inputted into our code . we also consider the constraints from @xmath138 by looking for a peak of @xmath143 recoil mass distribution of @xmath1-boson @xcite and the constraint of @xmath144 mev when @xmath145 @xcite . + these constraints limit the quantities such as @xmath146 \times br ( h_i \to \bar{b } b ) $ ] on the @xmath147 plane with the the subscript @xmath148 denoting the coupling coefficient of the @xmath149 interaction . they also impose a model - dependent lower bound on @xmath150 , e.g. , @xmath151 for the type - ii 2hdm ( from our scan results ) , @xmath152 for the l2hdm@xcite , and @xmath153 for the nmssm @xcite . these bounds are significantly lower than that of the sm , i.e. @xmath154 , partially because in new physics models , unconventional decay modes of @xmath155 such as @xmath156 are open up . 
as to the nmssm , another specific reason for allowing a significantly lighter cp - even higgs boson is that the boson may be singlet - dominated in this model . + with regard to the lightest cp - odd higgs boson @xmath0 , we checked that there is no lower bound on its mass so long as the @xmath157 interaction is weak or @xmath155 is sufficiently heavy . * the constraints from the lep search for a light higgs boson via the yukawa process @xmath158 with @xmath22 and @xmath61 denoting a scalar @xcite . these constraints can limit the @xmath159 coupling versus @xmath160 in new physics models . * the constraints from the cleo - iii limit on @xmath161 and the latest babar limits on @xmath162 . these constraints will put very tight constraints on the @xmath163 coupling for @xmath164 . in our analysis , we use the results of fig.8 in the second paper of @xcite to excluded the unfavored points . * the constraints from @xmath165 couplings . since the higgs sector can give sizable higher order corrections to @xmath165 couplings , we calculate them to one loop level and require the corrected @xmath165 couplings to lie within the @xmath166 range of their fitted value . the sm predictions for the couplings at @xmath1-pole are given by @xmath167 and @xmath168 @xcite , and the fitted values are given by @xmath169 and @xmath170 , respectively@xcite . we adopt the formula in @xcite to the 2hdm in our calculation . * the constraints from @xmath171 leptonic decay . we require the new physics correction to the branching ratio @xmath172 to be in the range of @xmath173 @xcite . we use the formula in @xcite in our calculation . + about the constraints ( 5 ) and ( 6 ) , two points should be noted . one is all higgs bosons are involved in the constraints by entering the self energy of @xmath171 lepton , the @xmath174 vertex correction or the @xmath175 vertex correction , and also the box diagrams for @xmath176@xcite . since the yukawa couplings of the higgs bosons to @xmath171 lepton get enhanced by @xmath54 and so do the corrections , @xmath54 must be upper bounded for given spectrum of the higgs sector . generally speaking , the lighter @xmath0 is , the more tightly @xmath54 is limited@xcite . the other point is in the type - ii 2hdm , @xmath177 , b - physics observables as well as @xmath178 decays discussed above can constraint the model in a tighter way than the constraints ( 5 ) and ( 6 ) since the yukawa couplings of @xmath171 lepton and @xmath179 quark are simultaneously enhanced by @xmath54 . but for the l2hdm , because only the yukawa couplings of @xmath171 lepton get enhanced ( see eq.[yukawa ] ) , the constraints ( 5 ) and ( 6 ) are more important in limiting @xmath54 . * indirect constraints from the precision electroweak observables such as @xmath180 , @xmath181 and @xmath182 , or their combinations @xmath183 @xcite . we require @xmath184 to be compatible with the lep / sld data at @xmath185 confidence level@xcite . we also require new physics prediction of @xmath186 is within the @xmath187 range of its experimental value . the latest results for @xmath188 are @xmath189 ( measured value ) and @xmath190 ( sm prediction ) for @xmath191 gev @xcite . in our code , we adopt the formula for these observables presented in @xcite to the type - ii 2hdm and the l2hdm respectively . + in calculating @xmath180 , @xmath181 and @xmath182 , we note that these observables get dominant contributions from the self energies of the gauge bosons @xmath1 , @xmath192 and @xmath193 . 
since there is no @xmath194 coupling or @xmath195 coupling , @xmath0 must be associated with the other higgs bosons to contribute to the self energies . so by the uv convergence of these quantities , one can infer that , for the case of a light @xmath0 and @xmath196 , these quantities depend on the spectrum of the higgs sector in a way like @xmath197 at leading order , which implies that a light @xmath0 can still survive the constraints from the precision electroweak observables given the splitting between @xmath150 and @xmath198 is moderate@xcite . * the constraints from b physics observables such as the branching ratios for @xmath199 , @xmath200 and @xmath201 , and the mass differences @xmath202 and @xmath203 . we require their theoretical predications to agree with the corresponding experimental values at @xmath187 level . + in the type - ii 2hdm and the l2hdm , only the charged higgs boson contributes to these observables by loops , so one can expect that @xmath198 versus @xmath54 is to be limited . combined analysis of the limits in the type - ii 2hdm has been done by the ckmfitter group , and the lower bound of @xmath204 as a function of @xmath87 was given in fig.11 of @xcite . this analysis indicates that @xmath198 must be heavier than @xmath205 at @xmath185 c.l . regardless the value of @xmath54 . in this work , we use the results of fig.11 in @xcite to exclude the unfavored points . as for the l2hdm , b physics actually can not put any constraints@xcite because in this model the couplings of the charged higgs boson to quarks are proportional to @xmath206 and in the case of large @xmath54 which we are interested in , they are suppressed . in our analysis of the l2hdm , we impose the lep bound on @xmath198 , i.e. @xmath207@xcite . * the constraints from the muon anomalous magnetic moment @xmath208 . now both the theoretical prediction and the experimental measured value of @xmath208 have reached a remarkable precision , but a significant deviation still exists : @xmath209 @xcite . in the 2hdm , @xmath208 gets additional contributions from the one - loop diagrams induced by the higgs bosons and also from the two - loop barr - zee diagrams mediated by @xmath0 and @xmath155@xcite . if the higgs bosons are much heavier than @xmath25 lepton mass , the contributions from the barr - zee diagrams are more important , and to efficiently alleviate the discrepancy of @xmath208 , one needs a light @xmath0 along with its enhanced couplings to @xmath25 lepton and also to heavy fermions such as bottom quark and @xmath171 lepton to push up the effects of the barr - zee diagram@xcite . the cp - even higgs bosons are usually preferred to be heavy since their contributions to @xmath208 are negative . + in the type - ii 2hdm , because @xmath54 is tightly constrained by the process @xmath210 at the lep@xcite and the @xmath178 decay@xcite , the barr - zee diagram contribution is insufficient to enhance @xmath208 to @xmath187 range around its measured value@xcite . so in our analysis , we require the type - ii 2hdm to explain @xmath208 at @xmath211 level . while for the l2hdm , @xmath54 is less constrained compared with the type - ii 2hdm , and the barr - zee diagram involving the @xmath171-loop is capable to push up greatly the theoretical prediction of @xmath208@xcite . therefore , we require the l2hdm to explain the discrepancy at @xmath187 level . 
+ unlike the other constraints discussed above , the @xmath208 constraint will put a two - sided bound on @xmath54 since on the one hand , it needs a large @xmath54 to enhance the barr - zee contribution , but on the other hand , too large @xmath54 will result in an unacceptable large @xmath208 . * since this paper concentrates on a light @xmath0 , the decay @xmath212 is open up with a possible large decay width . we require the width of any higgs boson to be smaller than its mass to avoid a too fat higgs boson@xcite . we checked that for the scenario characterized by @xmath213 , the coefficient of @xmath214 interaction is usually larger than the electroweak scale @xmath125 , and consequently a large decay width is resulted . for the nmssm and nmssm , the above constraints become more complicated because in these models , not only more higgs bosons are involved in , but also sparticles enter the constraints . so it is not easy to understand some of the constraints intuitively . take the process @xmath199 as an example . in the supersymmetric models , besides the charged higgs contribution , chargino loops , gluino loops as well as neutralino loops also contribute to the process@xcite , and depending on the susy parameters , any of these contributions may become dominated over or be canceled by other contributions . as a result , although the charged higgs affects the process in the same way as that in the type - ii 2hdm , charged higgs as light as @xmath215 is still allowed even for @xmath216@xcite . since among the constraints , @xmath208 is rather peculiar in that it needs new physics to explain the discrepancy between @xmath217 and @xmath218 , we discuss more about its dependence on susy parameters . in the nmssm and the nmssm , @xmath208 receives contributions from higgs loops and neutralino / chargino loops . for the higgs contribution , it is quite similar to that of the type - ii 2hdm except that more higgs bosons are involved in@xcite . for the neutralino / chargino contribution , in the light bino limit ( i.e. @xmath219 ) , it can be approximated by@xcite @xmath220 for @xmath221 with @xmath222 being smuon mass . so combining the two contributions together , one can learn that a light @xmath0 along with large @xmath54 and/or light smuon with moderate @xmath87 are favored to dilute the discrepancy . because more parameters are involved in the constraints on the supersymmetric models , we consider following additional constraints to further limit their parameters : * direct bounds on sparticle masses from the lep1 , the lep2 and the tevatron experiments @xcite . * the lep1 bound on invisible z decay @xmath223 ; the lep2 bound on neutralino production @xmath224 and @xmath225@xcite . * dark matter constraints from the wmap relic density 0.0975 @xmath226 0.1213 @xcite . note that among the above constraints , the constraint ( 2 ) on higgs sector and the constraint ( c ) on neutralino sector are very important . this is because in the supersymmetric models , the sm - like higgs is upper bounded by about @xmath227 at tree level and by about @xmath228 at loop level , and that the relic density restricts the lsp annihilation cross section in a certain narrow range . in our analysis of the nmssm , we calculate the constraints ( 3 ) and ( 5 - 7 ) by ourselves and utilize the code nmssmtools @xcite to implement the rest constraints . we also extend nmssmtools to the nmssm to implement the constraints . 
for the extension , the most difficult thing we faced is how to adapt the code micromegas@xcite to the nmssm case . we solve this problem by noting the following facts : * as we mentioned before , the nmssm is actually same as the nmssm with the trilinear singlet term setting to zero . so we can utilize the model file of the nmssm as the input of the micromegas and set @xmath229 . * since in the nmssm , the lsp is too light to annihilate into higgs pairs , there is no need to reconstruct the effective higgs potential to calculate precisely the annihilation channel @xmath230 with @xmath61 denoting any of higgs bosons@xcite . we thank the authors of the nmssmtools for helpful discussion on this issue when we finish such extension@xcite . with the above constraints , we perform four independent random scans over the parameter space of the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively . we vary the parameters in following ranges : @xmath231 for the type - ii 2hdm , @xmath232 for the l2hdm , @xmath233 for the nmssm , and @xmath234 for the nmssm . in performing the scans , we note that for the nmssm and the nmssm , some constraints also rely on the gaugino masses and the soft breaking parameters in the squark sector and the slepton sector . since these parameters affect little on the properties of @xmath0 , we fix them to reduce the number of free parameters in our scan . for the squark sector , we adopt the @xmath235 scenario which assumes that the soft mass parameters for the third generation squarks are degenerate : @xmath236 800 gev , and that the trilinear couplings of the third generation squarks are also degenerate , @xmath237 with @xmath238 . for the slepton sector , we assume all the soft - breaking masses and trilinear parameters to be 100 gev . this setting is necessary for the nmssm since this model is difficult to explain the muon anomalous moment at @xmath239 level for heavy sleptons@xcite . finally , we assume the grand unification relation @xmath240 for the gaugino masses with @xmath241 being fine structure constants of the different gauge group . with large number of random points in the scans , we finally get about @xmath242 , @xmath243 , @xmath244 and @xmath242 samples for the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively which survive the constraints and satisfy @xmath245 . analyzing the properties of the @xmath0 indicates that for most of the surviving points in the nmssm and the nmssm , its dominant component is the singlet field ( numerically speaking , @xmath246 ) so that its couplings to the sm fermions are suppressed@xcite . our analysis also indicates that the main decay products of @xmath0 are @xmath247 for the l2hdm@xcite , @xmath248 ( dominant ) and @xmath247 ( subdominant ) for the type - ii 2hdm , the nmssm and the nmssm , and in some rare cases , neutralino pairs in the nmssm@xcite . in fig.[fig4 ] , we project the surviving samples on the @xmath249 plane . this figure shows that the allowed range of @xmath54 is from @xmath250 to @xmath251 in the type - ii 2hdm , and from @xmath252 to @xmath253 in the l2hdm . just as we introduced before , the lower bounds of @xmath254 come from the fact that we require the models to explain the muon anomalous moment , while the upper bound is due to we have imposed the constraint from the lep process @xmath255 , which have limited the upper reach of the @xmath256 coupling for light @xmath61 @xcite(for the dependence of @xmath256 coupling on @xmath54 , see sec . 
this figure also indicates that for the nmssm and the nmssm , @xmath54 is upper bounded by @xmath257 . for the nmssm , this is because large @xmath87 can suppress the dark matter mass to make its annihilation difficult ( see @xcite and also sec . ii ) , but for the nmssm , this is because we choose a light slepton mass so that large @xmath54 can enhance @xmath208 too significantly to be experimentally unacceptable . we checked that for the slepton mass as heavy as @xmath258 , @xmath259 is still allowed for the nmssm . in fig.[fig5 ] and fig.[fig6 ] , we show the branching ratios of @xmath260 and @xmath261 respectively . fig.[fig5 ] indicates , among the four models , the type - ii 2hdm predicts the largest ratio for @xmath260 with its value varying from @xmath262 to @xmath263 . the underlying reason is in the type - ii 2hdm , the @xmath264 coupling is enhanced by @xmath54 ( see fig.[fig4 ] ) , while in the other three model , the coupling is suppressed either by @xmath265 or by the singlet component of the @xmath0 . fig.[fig6 ] shows that the l2hdm predicts the largest rate for @xmath266 with its value reaching @xmath5 in optimum case , and for the other three models , the ratio of @xmath261 is at least about one order smaller than that of @xmath267 . this feature can be easily understood from the @xmath268 coupling introduced in sect . we emphasize that , if the nature prefers a light @xmath0 , @xmath260 and/or @xmath269 in the type - ii 2hdm and the l2hdm will be observable at the gigaz . then by the rates of the two decays , one can determine whether the type - ii 2hdm or the l2hdm is the right theory . on the other hand , if both decays are observed with small rates or fail to be observed , the singlet extensions of the mssm are favored . in fig.[fig7 ] , we show the rate of @xmath3 as the function of @xmath270 . this figure indicates that the branching ratio of @xmath121 can reach @xmath271 , @xmath272 , @xmath273 and @xmath274 for the optimal cases of the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively , which implies that the decay @xmath121 will never be observable at the gigaz if the studied model is chosen by nature . the reason for the smallness is , as we pointed out before , that the decay @xmath121 proceeds only at loop level . comparing the optimum cases of the type - ii 2hdm , the nmssm and the nmssm shown in fig.5 - 7 , one may find that the relation @xmath275 holds for any of the decays . this is because the decays are all induced by the yukawa couplings with similar structure for the models . in the supersymmetric models , the large singlet component of the light @xmath0 is to suppress the yukawa couplings , and the @xmath0 in the nmssm has more singlet component than that in the nmssm . next we consider the decay @xmath11 , which , unlike the above decays , depends on the higgs self interactions . in fig.[fig8 ] we plot its rate as a function of @xmath270 and this figure indicates that the @xmath276 may be the largest among the ratios of the exotic @xmath1 decays , reaching @xmath277 in the optimum cases of the type - ii 2hdm , the l2hdm and the nmssm . the underlying reason is , in some cases , the intermediate state @xmath119 in fig.[fig3 ] ( a ) may be on - shell . in fact , we find this is one of the main differences between the nmssm and the nmssm , that is , in the nmssm , @xmath119 in fig.[fig3 ] ( a ) may be on - shell ( corresponds to the points with large @xmath278 ) while in the nmssm , this seems impossible . 
so we conclude that the decay @xmath11 may serve as an alternative channel to test new physics models , especially it may be used to distinguish the nmssm from the nmssm if the supersymmetry is found at the lhc and the @xmath11 is observed at the gigaz with large rate . before we end our discussion , we note that in the nmssm , the higgs boson @xmath0 may be lighter than @xmath279 without conflicting with low energy data from @xmath178 decays and the other observables ( see fig.[fig4]-[fig8 ] ) . in this case , @xmath0 is axion - like as pointed out in @xcite . we checked that , among the rare @xmath1 decays discussed in this paper , the largest branching ratio comes from @xmath280 which can reach @xmath281 . since in this case , the decay product of @xmath0 is highly collinear muon pair , detecting the decay @xmath280 may need some knowledge about detectors , which is beyond our discussion . in this paper , we studied the rare @xmath1-decays @xmath2 ( @xmath7 ) , @xmath282 and @xmath4 in the type - ii 2hdm , lepton - specific 2hdm , nmssm and nmssm , which predict a light cp - odd higgs boson @xmath0 . in the parameter space allowed by current experiments , the branching ratio can be as large as @xmath5 for @xmath118 , @xmath8 for @xmath3 and @xmath9 for @xmath4 , which implies that the decays @xmath2 and @xmath283 may be accessible at the gigaz option . since different models predict different size of branching ratios , these decays can be used to distinguish different model through the measurement of these rare decays . this work was supported in part by hastit under grant no . 2009hastit004 , by the national natural science foundation of china ( nnsfc ) under grant nos . 10821504 , 10725526 , 10635030 , 10775039 , 11075045 and by the project of knowledge innovation program ( pkip ) of chinese academy of sciences under grant no . . for some reviews , see , e.g. , m. a. perez , g. tavares - velasco and j. j. toscano , int . j. mod . a * 19 * , 159 ( 2004 ) ; j. m. yang , arxiv:1006.2594 . j. i. illana , m. masip , 67 , 035004 ( 2003 ) ; j. cao , z. xiong , j. m. yang , 32 , 245 ( 2004 ) . d. atwood _ et al_. , 66 , 093005 ( 2002 ) . j. kalinowski , and s. pokorski , 219 , 116 ( 1989 ) ; a. djouadi , p. m. zerwas and j. zunft , 259 , 175 ( 1991 ) ; a. djouadi , j. kalinowski , and p. m. zerwas , z. phys . c * 54 * , 255 ( 1992 ) . m. krawczyk , _ et al . _ , 19 , 463 ( 2001 ) ; 8 , 495 ( 1999 ) . j. f. gunion , g. gamberini and s. f. novaes , 38 , 3481 ( 1988 ) ; thomas j. weiler and tzu - chiang yuan , 318 , 337 ( 1989 ) ; a. djouadi , _ et al . _ , 1 , 163 ( 1998)[hep - ph/9701342 ] . d. chang and w. y. keung , phys . lett . * 77 * , 3732 ( 1996 ) . e. keith and e. ma , 57 , 2017 ( 1998 ) ; m. a. perez , g. tavares - velasco and j. j. toscano , int . j. mod.phys . a * 19 * , 159 ( 2004 ) . f. larios , g. tavares - velasco and c. p. yuan , 64 , 055004 ( 2001 ) ; 66 , 075006 ( 2002 ) . a. djouadi , _ et al . _ , 10 , 27 ( 1999 ) [ hep - ph/9903229 ] . for a detailed introduction of the nmssm , see f. franke and h. fraas , int . j. mod . a * 12 * ( 1997 ) 479 ; for a recent review of the nmssm , see for example , u. ellwanger , c. hugonie , and a. m. teixeira , arxiv : 0910.1785 . see , e.g. , j. r. ellis , j. f. gunion , h. e. haber , l. roszkowski and f. zwirner , phys . rev . d * 39 * ( 1989 ) 844 ; m. drees , int . j. mod . phys . a * 4 * ( 1989 ) 3635 ; u. ellwanger , m. rausch de traubenberg and c. a. savoy , phys . b * 315 * ( 1993 ) 331 ; nucl . b * 492 * ( 1997 ) 21 ; d.j . 
miller , r. nevzorov , p.m. zerwas , 681 , 3 ( 2004 ) . c. panagiotakopoulos , k. tamvakis , 446 , 224 ( 1999 ) ; 469 , 145 ( 1999 ) ; c. panagiotakopoulos , a. pilaftsis , 63 , 055003 ( 2001 ) ; a. dedes , _ et al . _ , 63 , 055009 ( 2001 ) ; a. menon , _ et al . _ , 70 , 035005 ( 2004 ) ; v. barger , _ et al . _ , 630 , 85 ( 2005 ) . c. balazs , _ et al . _ , 0706 , 066 ( 2007 ) . b. a. dobrescu , k. t. matchev , 0009 , 031 ( 2000 ) ; a. arhrib , k. cheung , t. j. hou , k. w. song , hep - ph/0611211 ; 0703 , 073 ( 2007 ) ; x. g. he , j. tandean , and g. valencia , 98 , 081802 ( 2007 ) ; 0806 , 002 ( 2008 ) ; f. domingo _ et al_. , 0901 , 061 ( 2009 ) ; gudrun hiller , 70 , 034018 ( 2004 ) ; r. dermisek , and john f. gunion , 75 , 075019 ( 2007 ) ; 79 , 055014 ( 2009 ) ; 81 , 055001 ( 2010 ) ; r. dermisek , john f. gunion , and b. mcelrath , 76 , 051105 ( 2007 ) ; z. heng , _ et al_. , 77 , 095012 ( 2008 ) ; a. belyaev _ et al_. , 81 , 075021 ( 2010 ) ; d. das and u. ellwanger , arxiv:1007.1151 [ hep - ph ] . s. andreas , o. lebedev , s. ramos - sanchez and a. ringwald , arxiv:1005.3978 [ hep - ph ] . j. f. gunion , jhep * 0908 * , 032 ( 2009 ) ; r. dermisek and j. f. gunion , phys . rev . d * 81 * , 075003 ( 2010 ) . r. dermisek and j. f. gunion , phys . lett . * 95 * , 041801 ( 2005 ) ; phys . d * 73 * , 111701 ( 2006 ) . j. cao , h. e. logan , j. m. yang , 79 , 091701 ( 2009 ) . j. cao , p. wan , l. wu , j. m. yang , 80 , 071701 ( 2009 ) . j. f. gunion and h. e. haber , 67 , 075019 ( 2003 ) . r. m. barnett , _ et al . _ , phys . b * 136 * , 191 ( 1984 ) ; r. m. barnett , g. senjanovic and d. wyler , phys . d * 30 * , 1529 ( 1984 ) ; y. grossman , nucl . b * 426 * , 355 ( 1994 ) . h. s. goh , l. j. hall and p. kumar , jhep * 0905 * , 097 ( 2009 ) ; a. g. akeroyd and w. j. stirling , nucl . b * 447 * , 3 ( 1995 ) ; a. g. akeroyd , phys . b * 377 * , 95 ( 1996 ) ; h. e. logan and d. maclennan , phys . rev . d * 79 * , 115022 ( 2009 ) ; m. aoki , _ et al . _ , arxiv:0902.4665 [ hep - ph ] . v. barger , p. langacker , h. s. lee and g. shaughnessy , phys . d * 73 * , 115010 ( 2006 ) . s. hesselbach , _ et . _ , arxiv:0810.0511v2 [ hep - ph ] . de vivie and p. janot [ aleph collaboration ] , pa13 - 027 contribution to the international conference on high energy physics , warsaw , poland , 2531 july 1996 ; j. kurowska , o. grajek and p. zalewski [ delphi collaboration ] , cern - open-99 - 385 . [ aleph collaboration and delphi collaboration and l3 collaboration ] , phys . rept . * 427 * , 257 ( 2006 ) . j. cao and j. m. yang , jhep * 0812 * , 006 ( 2008 ) . m. krawczyk and d. temes , eur . j. c * 44 * , 435 ( 2005 ) . g. altarelli and r. barbieri , 253 , 161 ( 1991 ) ; m. e. peskin , t. takeuchi , 46 , 381 ( 1992 ) . c. amsler , _ et al . _ , ( particle data group ) , 667 , 1 ( 2008 ) . o. deschamps , s. descotes - genon , s. monteil , v. niess , s. tjampens and v. tisserand , arxiv:0907.5135 [ hep - ph ] . s. su and b. thomas , phys . d * 79 * , 095014 ( 2009 ) . g. abbiendi , _ et al . _ , eur . phys . j. c * 32 * , 453 ( 2004 ) . m. davier , _ et al . _ , 66 , 1 ( 2010 ) . k. cheung , _ et al . _ , phys . d * 64 * , 111301 ( 2001 ) . k. cheung and o. c. w. kong , phys . d * 68 * , 053003 ( 2003 ) . t. besmer , c. greub , t.hurth , 609 , 359 ( 2001 ) ; f. borzumati , _ et al . _ , 62 , 075005(2000 ) . j. cao , k. i. hikasa , w. wang , j. m. yang and l. x. yu , phys . d * 82 * , 051701 ( 2010 ) [ arxiv:1006.4811 [ hep - ph ] ] . j. f. gunion , _ et . d * 73 * , 015011 ( 2006 ) . martin and j. 
d. wells , phys . d * 64 * , 035003 ( 2001 ) . j. abdallah _ et al . _ , eur . j. c * 31 * , 421 ( 2004 ) ; g. abbiendi _ et al . _ , eur . j. c * 35 * , 1 ( 2004 ) . j. dunkley _ et al . _ [ wmap collaboration ] , astrophys . j. suppl . * 180 * , 306 ( 2009 ) [ arxiv:0803.0586 [ astro - ph ] ] . u. ellwanger _ et al . _ , 02 , 066 ( 2005 ) . g. belanger , f. boudjema , a. pukhov and a. semenov , comput . commun . * 174 * , 577 ( 2006 ) ; comput . phys . commun . * 176 * , 367 ( 2007 ) . g. belanger , f. boudjema , c. hugonie , a. pukhov and a. semenov , jcap * 0509 * , 001 ( 2005 ) .""" + + ARTICLE_MAGNET = r"""it is well known that the classical magnetoresistance ( mr ) in metals or semiconductors with a closed free electron fermi surface increases quadratically with increasing magnetic field @xmath2 for @xmath3 and saturates when @xmath4 . here @xmath5 is the zero - magnetic - field mobility . hence , the extraordinarily high and linear mr ( lmr ) , which breaks this familiar rule , has been gaining much attention as soon as its discovery . in the past decade , this unexpected lmr has been reported in silver chalcogenide,@xcite indium antimonide,@xcite silicon,@xcite mnas - gaas composite material,@xcite and graphene.@xcite kapitza s linear law@xcite indicates that the metal shows a magnetoresistance linear in perpendicular magnetic field when it has an open fermi surface and a mean free path longer than the electronic larmor radius . recently , another two models , irrespective of the open fermi surface , have been constructed to provide possible mechanisms for the lmr phenomenon . abrikosov suggested a quantum - limit origin of lmr for the homogenous system with a gapless linear energy spectrum.@xcite his model requires that landau levels are well formed and the carrier concentration is small that all electrons occupy only the lowest landau band . alternatively , parish and littlewood developed a classical model without involving linear spectrum.@xcite ignoring the concrete microscopic mechanism , they attributed this unusual mr to the mobility fluctuations in a strongly inhomogenous system . topological insulators@xcite ( tis ) are novel materials with a full energy gap in bulk , while there are gapless surface states . due to its unique band structure with only one helical dirac cone and linear energy dispersion,@xcite the surface states of the ti bi@xmath0se@xmath1 become an excellent platform for the study of quantum - limit lmr . the recent experiment in this flat surface system , however , reported that a large positive mr , which becomes very linear above a characteristic field of @xmath6@xmath7@xmath8 t , was observed even in an opposite situation where the carrier sheet density is high that electrons occupy more than one landau levels.@xcite moreover , they found that raising temperature to room temperature almost has no influence on the observed lmr . it is striking that this observation is in conflict with abrikosov s model and also with the classical parish - littlewood model . so far a reliable theoretical scheme capable of explaining this novel experiment has still been lacking . in this paper , we generalize the balance - equation approach@xcite to a system modeling the surface states of a three - dimensional ti to investigate the two - dimensional magnetotransport in it . 
we find that a positive , nonsaturating and dominantly linear magnetoresistance can appear within quite wide magnetic - field range in the ti surface state having a positive and finite effective g - factor . this linear magnetoresistance shows up in the system of high carrier concentration and low mobility when electrons are in extended states and spread over many smeared landau levels , and persists up to room temperature , providing a possible mechanism for the recently observed linear magnetoresistance in topological insulator bi@xmath0se@xmath1 nanoribbons.@xcite we consider the surface state of a bi@xmath0se@xmath1-type large bulk gap ti in the @xmath9-@xmath10 plane under the influence of a uniform magnetic field @xmath11 applied along the @xmath12 direction.@xcite following the experimental observation,@xcite we assume that the fermi energy locates in the gap of the bulk band and above the dirac point , i.e. the surface carriers are electrons . further , the separations of the fermi energy from the bottom of bulk band and dirac point are much larger than the highest temperature ( @xmath13 ) considered in this work . hence , the contribution from the bulk band to the magnetotransport is negligible . these electrons , scattered by randomly distributed impurities and by phonons , are driven by a uniform in - plane electric field @xmath14 in the topological surface . the hamiltonian of this many - electron and phonon system consists of an electron part @xmath15 , a phonon part @xmath16 , and electron - impurity and electron - phonon interactions @xmath17 and @xmath18 : @xmath19 here , the electron hamiltonian is taken in the form @xmath20 , \ ] ] in which @xmath21 , @xmath22 , @xmath23 and @xmath24 , stand , respectively , for the canonical momentum , coordinate , momentum and spin operators of the @xmath25th electron having charge @xmath26 , @xmath27 is the vector potential of the perpendicular magnetic field @xmath28 in the landau gauge , @xmath29 is the fermi velocity , @xmath30 is the effective g - factor of the surface electron , and @xmath31 is the bohr magneton with @xmath32 the free electron mass . the sum index @xmath25 in eq.([helectron ] ) goes over all electrons of total number @xmath33 in the surface state of unit area . in the frame work of balance equation approach,@xcite the two - dimensional center - of - mass ( c.m . ) momentum and coordinate @xmath34 and @xmath35 , and the relative - electron momenta and coordinates @xmath36 and @xmath37 are introduced to write the hamiltonian @xmath15 into the sum of a single - particle c.m . part @xmath38 and a many - particle relative - electron part @xmath39 : @xmath40 , with @xmath41.\end{aligned}\ ] ] in this , @xmath42 is the canonical momentum of the center - of - mass and @xmath43 is the canonical momentum for the @xmath25th relative electron . here we have also introduced c.m . spin operators @xmath44 and @xmath45 . the commutation relations between the c.m . spin operators @xmath46 and @xmath47 and the spin operators @xmath48 , @xmath49 and @xmath50 of the @xmath25th electron are of order of @xmath51 : @xmath52= n^{-1}2\,{\rm i}\,\varepsi lon_{\beta_1\beta_2\beta_3}\sigma_j^{\beta_3}$ ] with @xmath53 . therefore , for a macroscopic large @xmath33 system , the c.m . part @xmath38 actually commutes with the relative - electron part @xmath54 in the hamiltonian , i.e. the c.m . motion and the relative motion of electrons are truly separated from each other . 
the couplings between the two emerge only through the electron impurity and electron phonon interactions . furthermore , the electric field @xmath55 shows up only in @xmath38 . and , in view of @xmath56={\rm i}\delta_{\alpha \beta}(\delta_{ij}-1/n)\simeq { \rm i}\delta_{\alpha\beta}\delta_{ij}$ ] , i.e. the relative - electron momenta and coordinates can be treated as canonical conjugate variables , the relative - motion part @xmath54 is just the hamiltonian of @xmath33 electrons in the surface state of ti in the magnetic field without the presence of the electric field . in terms of the c.m . coordinate @xmath57 and the relative electron density operator @xmath58 , the electron impurity and electron phonon interactions can be written as@xcite @xmath59 here @xmath60 and @xmath61 are respectively the impurity potential ( an impurity at randomly distributed position @xmath62 ) and electron phonon coupling matrix element in the plane - wave representation , and @xmath63 with @xmath64 and @xmath65 being the creation and annihilation operators for a phonon of wavevector @xmath66 in branch @xmath67 having frequency @xmath68 . velocity ( operator ) @xmath69 is the time variation of its coordinate : @xmath70= v_{\rm f}(\sigma_{\rm c}^y\ , \hat{i}-\sigma_{\rm c}^x\ , \hat{j})$ ] . to derive a force - balance equation for steady state transport we consider the heisenberg equation for the rate of change of the c.m . canonical momentum @xmath71 : @xmath72= - n e({\bm v}\times { \bm b})- n e{\bm e}+{\bm { f}}_{\rm i}+{\bm { f}}_{\rm p},\ ] ] in which the frictional forces @xmath73 and @xmath74 share the same expressions as given in ref .. the statistical average of the operator equation can be determined to linear order in the electron impurity and electron phonon interactions @xmath17 and @xmath18 with the initial density matrix @xmath75 at temperature @xmath76 when the in - plane electric field @xmath77 is not strong . for steady - transport states we have @xmath78 , leading to a force - balance equation of the form @xmath79 here @xmath80 , the statistically averaged velocity of the moving center - of - mass , is identified as the average rate of change of its position , i.e. the drift velocity of the electron system driven by the electric field @xmath77 , and @xmath81 and @xmath82 are frictional forces experienced by the center - of - mass due to impurity and phonon scatterings : @xmath83,\label{fp}\end{aligned}\ ] ] in which @xmath84 is the bose distribution function , @xmath85 , and @xmath86 stands for the imaginary part of the fourier spectrum of the relative - electron density correlation function defined by @xmath87\big\rangle_{0},\ ] ] where @xmath88 and @xmath89 denotes the statistical averaging over the initial density matrix @xmath90.@xcite the force - balance equation describes the steady - state two - dimensional magnetotransport in the surface state of a ti . note that the frictional forces @xmath81 and @xmath82 are in the opposite direction of the drift velocity @xmath91 and their magnitudes are functions of @xmath92 only . with the drift velocity @xmath93 in the @xmath9 direction , the force - balance equation eq . yields a transverse resistivity @xmath94 , and a longitudinal resistivity @xmath95 . 
the linear one is in the form @xmath96 for calculating the electron density correlation function @xmath97 we proceed in the landau representation.@xcite the landau levels of the single - particle hamiltonian @xmath98 of the relative - electron system in the absence of electric field are composed of a positive `` @xmath99 '' and a negative `` @xmath100 '' branch@xcite @xmath101 with @xmath102 and @xmath103 , and a zero ( @xmath104 ) level @xmath105 the corresponding landau wave functions are @xmath106 and @xmath107 for @xmath108 ; and @xmath109 for @xmath104 . here @xmath110 is the wavevector of the system along @xmath9 direction ; @xmath111 with @xmath112 ; and @xmath113 is the harmonic oscillator eigenfunction with @xmath114 being the hermite polynomial , @xmath115 , and @xmath116 . each landau level contains @xmath117 electron states for system of unit surface area . the positive branch @xmath118 and the @xmath104 level @xmath119 of the above energy spectra are indeed quite close to those of the surface states in the bulk gap of bi@xmath0se@xmath1-family materials derived from microscopic band calculation.@xcite the landau levels are broadened due to impurity , phonon and electron - electron scatterings . we model the imaginary part of the retarded green s function , or the density - of - states , of the broadened landau level @xmath120 ( written for `` + ' ' -branch and @xmath104 levels ) , using a gaussian - type form:@xcite @xmath121,\ ] ] with a half - width @xmath122 of the form:@xcite @xmath123^{1/2}$ ] . here @xmath124 is the single - particle lifetime and @xmath125 is the cyclotron frequency of linear - energy - dispersion system with @xmath126 being the zero - temperature fermi level . using a semi - empirical parameter @xmath127 to relate @xmath124 with the transport scattering time @xmath128 , and expressing @xmath129 with the zero - field mobility @xmath5 at finite temperature,@xcite we can write the landau - level broadening as @xmath130^{1/2}.\ ] ] in the present study we consider the case of @xmath120-doping , i.e. the fermi level is high enough above the energy zero of the dirac cone in the range of `` + ' ' -branch levels and the states of `` @xmath100''-branch levels are completely filled , that they are irrelevant to electron transport . special attention has to be paid to the @xmath104 level , since , depending on the direction of exchange potential the effective g - factor of a ti surface state , @xmath30 , can be positive , zero or negative.@xcite the sign and magnitude of the effective g - factor determines how many states of the zero level should be included in or excluded from the available states for electron occupation in the case of @xmath120-doping at a magnetic field . ( i ) if @xmath131 , the @xmath104 level center is exactly at @xmath132 and the system is electron - hole symmetric . the total number of negative energy states ( including the states of the lower half of the @xmath104 level and states of the @xmath100"-branch levels ) and that of positive energy states ( including the states of the upper half of the @xmath104 level and states of the @xmath99"-branch levels ) do not change when changing magnetic field . therefore , the lower - half negative energy states of this level are always filled and the upper - half positive - energy states of it are available for the occupation of particles which are counted as electrons participating in transport in the case of @xmath120-doping . 
( ii ) for a finite positive @xmath133 , the @xmath104 level @xmath134 moves downward to negative energy and its distance to the nearest @xmath100"-branch level is @xmath135 closer than to the nearest + " -branch level at finite magnetic field strength @xmath2 . this is equivalent to the opening of an increasingly enlarged ( with increasing @xmath2 ) energy gap between the + " -branch states and the states of the zero - level and the @xmath100"-branch levels . the opening of a sufficient energy gap implies that with increasing magnetic field the states in the + " -branch levels would no longer shrink into the zero - level , and thus the @xmath104 level should be completely excluded from the conduction band , i.e. only particles occupying the + " -branch states are counted as electrons participating in transport in the case of @xmath120-doping , when the magnetic field @xmath2 gets larger than a certain value ( depending on the magnitude of @xmath30 ) . ( iii ) for a finite negative @xmath136 , the @xmath104 level @xmath134 moves upward to positive energy and an increasingly enlarged energy gap will be opened between the states of the zero - level and the + " -branch and the states of @xmath100"-branch levels , and particles occupying the @xmath104 level and + " -branch states are electrons participating in transport when the magnetic field @xmath2 gets larger than a certain value . as a result , the experimentally accessible sheet density @xmath33 of electrons participating in transport is related to the fermi energy @xmath137 by the following equation valid at finite @xmath30 for the magnetic field @xmath2 larger than a certain value : @xmath138 in which @xmath139 + 1\}^{-1}$ ] is the fermi distribution function at temperature @xmath76 and the summation index @xmath120 goes over @xmath140 for @xmath133 , or @xmath141 for @xmath136 . in the case of @xmath131 , @xmath142\ ] ] valid for arbitrary magnetic field , in which @xmath143 . the imaginary part of relative - electron density correlation function in the presence of a magnetic field , @xmath86 , can be expressed in the landau representation as@xcite @xmath144 in which the transform factor @xmath145 ^ 2,\end{aligned}\ ] ] with @xmath146 , @xmath147 , @xmath148 , and @xmath149 being associated laguerre polynomials . the landau - representation correlation function @xmath150 in eq.([piqw ] ) can be constructed with the imaginary part of the retarded green s function @xmath151 , or the density - of - states , of the @xmath120th landau level as@xcite @xmath152\nonumber\\ & \hspace{1.2cm}\times{\rm im}g_n(\epsilon+\omega){\rm im}g_{n'}(\epsilon).\end{aligned}\ ] ] the summation indices @xmath120 and @xmath153 in eq.([piqw ] ) are taken over @xmath140 for @xmath133 , or @xmath154 for @xmath136 . in the case of @xmath131 , eq.([piqw ] ) still works and the summation indices @xmath120 and @xmath153 go over @xmath154 but with @xmath155 replaced by @xmath156 in eq.([p2nn ] ) . numerical calculations are performed for the magnetoresistivity @xmath157 of surface state in a uniform ti bi@xmath0se@xmath1 . at zero temperature the elastic scattering contributing to the resistivity is modeled by a coulomb potential due to charged impurities:@xcite @xmath158 with @xmath159 being the impurity density , which is determined by the zero - magnetic - field mobility @xmath5 . at temperatures higher than @xmath160,@xcite phonon scatterings play increasingly important role and the dominant inelastic contribution comes from optical phonons . 
for this polar material , the scattering by optical phonons via the deformation potential can be neglected . hence , we take account of inelastic scattering from optical phonons via frhlich coupling : @xmath161 . in the numerical calculation we use the following parameters:@xcite fermi velocity @xmath162 , static dielectric constant @xmath163 , optical dielectric constant @xmath164 , and phonon energy @xmath165 . the broadening parameter is taken to be @xmath166 . as a function of the magnetic field @xmath2 having different effective g - factors : @xmath167 and @xmath168 for a ti surface system with electron sheet density @xmath169 in the cases of zero - magnetic - field mobility @xmath170 ( a ) and @xmath171 ( b ) . several integer - number positions of filling factor @xmath172 are marked in ( b).,scaledwidth=40.0% ] fig.[diffg ] shows the calculated magnetoresistivity @xmath157 versus the magnetic field strength @xmath2 for a ti surface system with electron sheet density @xmath169 but having different effective g - factors : @xmath167 and @xmath168 for two values of zero - magnetic - field mobility @xmath170 and @xmath171 , representing different degree of landau - level broadening . in the case without zeeman splitting ( @xmath131 ) the resistivity @xmath157 exhibits almost no change with changing magnetic field up to 10 t , except the shubnikov - de haas ( sdh ) oscillation showing up in the case of @xmath171 . this kind of magnetoresistance behavior was indeed seen experimentally in the electron - hole symmetrical massless system of single - layer graphene.@xcite in the case of a positive g - factor , @xmath173 , the magnetoresistivity increases linearly with increasing magnetic field ; while for a negative g - factor , @xmath174 , the magnetoresistivity decreases linearly with increasing magnetic field . is shown as a function of the magnetic field @xmath2 for different values of zero - magnetic - field mobility : ( a ) @xmath175 , ( b ) @xmath176 , ( c ) @xmath177 , ( d ) @xmath178 , ( e ) @xmath179 , and ( f ) @xmath180 . the inset of ( a ) illustrates the same for a larger magnetic - field range @xmath181 . the filling factor @xmath182 is plotted versus the magnetic field in ( f ) ; and several integer - number positions of @xmath182 are also marked in ( d ) and ( e ) . here the surface electron density @xmath169 and the lattice temperature @xmath183.,scaledwidth=47.0% ] in the following we will give more detailed examination on the linearly increasing magnetoresistance in the positive @xmath30 case . fig.[rhob ] shows the calculated resistivity @xmath157 versus the magnetic field strength @xmath2 at lattice temperature @xmath183 for system of carrier sheet density @xmath169 and @xmath173 , having different zero - field mobility @xmath184 and @xmath180 . all resistivity curves for mobility @xmath185 exhibit clear linearity in the magnetic - field range and appear no tendency of saturation at the highest field shown in the figure . especially , for the case @xmath170 , the linear behavior extends even up to the magnetic field of @xmath186 , as illustrated in the inset of fig.[rhob](a ) . this feature contradicts the classical mr which saturates at sufficiently large magnetic field @xmath187 . 
note that here we only present the calculated @xmath157 for magnetic field @xmath2 larger than @xmath188 t , for which a sufficient energy gap @xmath135 is assumed to open that with further increase of the magnetic field the states in the `` + ' ' -branch levels no longer shrink into the zero level and thus it should be excluded from the conduction band . this is of course not true for very weak magnetic field . when @xmath189 the energy gap @xmath190 , the situation becomes similar to the case of @xmath131 : the whole upper half of the zero - level states are available to electron occupation and we should have a flat resistivity @xmath157 when changing magnetic field . with increasing @xmath2 the portion of the zero - level states available to conduction electrons decreases until the magnetic field reaches @xmath191 . as a result the resistivity @xmath157 should exhibit a crossover from a flat changing at small @xmath2 to positively linear increasing at @xmath192 . this is just the behavior observed in the ti bi@xmath0se@xmath1.@xcite note that in the case of @xmath170 , the broadened landau - level widths are always larger than the neighboring level interval : @xmath193 , which requires @xmath194 ^ 2 $ ] , even for the lowest landau level @xmath195 , i.e. the whole landau - level spectrum is smeared . with increasing the zero - field mobility the magnitude of resistivity @xmath157 decreases , and when the broadened landau - level width becomes smaller than the neighboring level interval , @xmath196 , a weak sdh oscillation begin to occur around the linearly - dependent average value of @xmath157 at higher portion of the magnetic field range , as seen in fig.[rhob](c ) , ( d ) and ( e ) for @xmath197 and @xmath198 . on the other hand , in the case of large mobility , e.g. @xmath199 , where the broadened landau - level widths @xmath200 are much smaller than the neighboring level interval even for level index @xmath120 as large as @xmath201 , the magnetoresistivity shows pronounced sdh oscillation and the linear - dependent behavior disappears , before the appearance of quantum hall effect,@xcite as shown in fig.[rhob](f ) . abrikosov s model for the lmr requires the applied magnetic field large enough to reach the quantum limit at which all the carriers are within the lowest landau level,@xcite while it is obvious that more than one landau levels are occupied in the experimental samples in the field range in which the linear and non - saturating magnetoresistivity was observed.@xcite for the given electron surface density @xmath202 , the number of occupied landau levels , or the filling factor @xmath172 , at different magnetic fields is shown in fig.[rhob](f ) , as well as in the fig.[rhob](d ) and ( e ) , where the integer - number positions of @xmath203 , i.e. filling up to entire @xmath182 landau levels , coincide with the minima of the density - of - states or the dips of sdh oscillation . this is in contrast with @xmath131 case , where the integer number of @xmath203 , which implies a filling up to the center position of the @xmath182th landau levels , locates at a peak of sdh oscillation , as shown in fig.[diffg]b . the observed sdh oscillations in the bi@xmath0se@xmath1 nanoribbon exhibiting nonsaturating surface lmr in the experiment@xcite favor the former case : a finite positive effective @xmath133 . 
is plotted as a function of the surface electron density @xmath33 at magnetic field @xmath204 : ( a ) at different values of zero - field mobility @xmath5 , and ( b ) at different values of zero - field conductivity @xmath205.,scaledwidth=40.0% ] at various lattice temperatures . here the zero - magnetic - field mobility at zero temperature is @xmath206.,scaledwidth=35.0% ] next , we examine the density - dependence of the linear magnetoresistivity . to compare with abrikosov s quantum magnetoresistance which suggests a @xmath207 behavior,@xcite we show the calculated @xmath208 for above lmr versus the carrier sheet density @xmath33 in fig.[rhon ] at fixed magnetic field @xmath209 t . the mobility is taken respectively to be @xmath210 and @xmath211m@xmath212/vs to make the resistivity in the lmr regime . a clearly linear dependence of @xmath213 on the surface density @xmath33 is seen in all cases , indicating that this non - saturating linear resistivity is almost inversely proportional to the carrier density . in the figure we also show @xmath208 versus @xmath33 under the condition of different given conductivity @xmath214 and @xmath215 . in this case the half - width @xmath216 is independent of surface density . the linear dependence still holds , indicating that this linear behavior is not sensitive to the modest @xmath33-dependence of landau level broadening @xmath216 as long as the system is in the overlapped landau level regime . from the above discussion , it is obvious that lmr shows up in the system having overlapped landau levels and the separation of landau levels makes the mr departure from the linear increase . at high temperature , the thermal energy would smear the level separation and phonon scatterings further broaden landau levels . hence , it is believed that this lmr will be robust against raising temperature . this is indeed the case as seen in fig.[rhot ] , where we plot the calculated magnetoresistivity @xmath157 for the above system with zero - temperature linear mobility @xmath217m@xmath212/vs versus the magnetic field at different lattice temperatures . we can see that raising temperature to room temperature has little effect on the linearity of mr . due to the decreased mobility at higher temperature from phonon scattering , the weak sdh oscillation on the linear background tends to vanish . these features are in good agreement with the experimental report.@xcite in summary , we have studied the two - dimensional magnetotransport in the flat surface of a three - dimensional ti , which arises from the surface states with a wavevector - linear energy dispersion and a finite , positive zeeman splitting within the bulk energy gap . when the level broadening is comparable to or larger than the landau - level separation and the conduction electrons spread over many landau levels , a positive , dominantly linear and non - saturating magnetoresistance appears within a quite wide range of magnetic field and persists up to room temperature . 
this remarkable lmr provides a possible mechanism for the recently observed linear magnetoresistance in topological insulator bi@xmath0se@xmath1 nanoribbons.@xcite in contrast to quantum hall effect which appears in the case of well formed landau levels and to abrikosov s quantum magnetotransport,@xcite which is limited to the extreme quantum limit that all electrons coalesce into the lowest landau level , the discussed lmr is a phenomena of pure classical two - dimensional magnetotransport in a system having linear - energy - dispersion , appearing in the regime of overlapped landau levels , irrespective of its showing up in relatively high magnetic field range . furthermore , the present scheme deals with spatially uniform case without invoking the mobility fluctuation in a strongly inhomogeneous system , which is required in the classical parish and littlewood model to produce a lmr.@xcite the appearance of this significant positive - increasing linear magnetoresistance depends on the existence of a positive and sizable effective g - factor . if the zeeman energy splitting is quite small the resistivity @xmath157 would exhibit little change with changing magnetic field . in the case of a negative and sizable effective g - factor the magnetoresistivity would decrease linearly with increasing magnetic field . therefore , the behavior of the longitudinal resistivity versus magnetic field may provide a useful way for judging the direction and the size of the effective zeeman energy splitting in ti surface states . this work was supported by the national science foundation of china ( grant no . 11104002 ) , the national basic research program of china ( grant no . 2012cb927403 ) and by the program for science&technology innovation talents in universities of henan province ( grant no . 2012hastit029 ) .""" + + dct = tok.batch_encode_plus( + [ARTICLE_LEP, ARTICLE_MAGNET], + max_length=6144, + padding="max_length", + truncation=True, + return_tensors="pt", + ) + + hypotheses_batch = hf.generate( + input_ids=dct["input_ids"].to(torch_device), + attention_mask=dct["attention_mask"].to(torch_device), + num_beams=4, + max_length=512, + early_stopping=True, + no_repeat_ngram_size=3, + ) + + EXPECTED_LEP = ( + " the physics of @xmath0-boson will again play the central role in the frontier of particle physics if the" + " gigaz option of the international linear collider ( ilc ) can be realized in its first phase. \n the" + " expected sensitivity to the branching ratio of rare decays, especially its exotic or rare processes," + " should be investigated comprehensively to evaluate their potential in probing new physics. in this work" + " \n, we study the rare decay into light higgs boson(s ) in the framework of the minimal supersymmetric" + " standard model ( mssm ), where a light cp - odd higgs - boson with singlet - dominant component may" + " naturally arise from the spontaneous breaking of some approximate global symmetry. " + ) + + EXPECTED_MAGNET = ( + " the recent experiment in the surface states of the topological insulator bi@xmath0se @xmath1, however," + " reported that a large positive magnetoresistance becomes very linear in perpendicular magnetic field" + " even in an opposite situation where the carrier sheet density is high that all electrons occupy more" + " than one landau levels. \n it is striking that this observation is in conflict with abrikosov s model" + " and also with the classical parish - littlewood model. 
" + ) + + generated = tok.batch_decode( + hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True + ) + assert generated == [EXPECTED_LEP, EXPECTED_MAGNET] diff --git a/docs/transformers/tests/models/led/test_modeling_tf_led.py b/docs/transformers/tests/models/led/test_modeling_tf_led.py new file mode 100644 index 0000000000000000000000000000000000000000..e63b376d58c9d4b01c4a55e9340f66f88f777935 --- /dev/null +++ b/docs/transformers/tests/models/led/test_modeling_tf_led.py @@ -0,0 +1,342 @@ +# Copyright Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import annotations + +import unittest + +from transformers import LEDConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFLEDForConditionalGeneration, TFLEDModel + + +@require_tf +class TFLEDModelTester: + config_cls = LEDConfig + config_updates = {} + hidden_act = "gelu" + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + attention_window=4, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.attention_window = attention_window + + # `ModelTesterMixin.test_attention_outputs` is expecting attention tensors to be of size + # [num_attention_heads, encoder_seq_length, encoder_key_length], but TFLongformerSelfAttention + # returns attention of shape [num_attention_heads, encoder_seq_length, self.attention_window + 1] + # because its local attention only attends to `self.attention_window` and one before and one after + self.key_length = self.attention_window + 2 + + # because of padding `encoder_seq_length`, is different from `seq_length`. 
Relevant for + # the `test_attention_outputs` and `test_hidden_states_output` tests + self.encoder_seq_length = ( + self.seq_length + (self.attention_window - self.seq_length % self.attention_window) % self.attention_window + ) + + def prepare_config_and_inputs_for_common(self): + input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size) + eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1) + input_ids = tf.concat([input_ids, eos_tensor], axis=1) + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = self.config_cls( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_ids=[2], + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.pad_token_id, + attention_window=self.attention_window, + **self.config_updates, + ) + inputs_dict = prepare_led_inputs_dict(config, input_ids, decoder_input_ids) + global_attention_mask = tf.concat( + [tf.zeros_like(input_ids)[:, :-1], tf.ones_like(input_ids)[:, -1:]], + axis=-1, + ) + inputs_dict["global_attention_mask"] = global_attention_mask + return config, inputs_dict + + def check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = TFLEDModel(config=config).get_decoder() + input_ids = inputs_dict["input_ids"] + + input_ids = input_ids[:1, :] + attention_mask = inputs_dict["attention_mask"][:1, :] + self.batch_size = 1 + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8) + + # append to next input_ids and + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) + next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0] + + self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1]) + + # select random slice + random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1])) + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] + output_from_past_slice = output_from_past[:, :, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3) + + +def prepare_led_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, +): + if attention_mask is None: + attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8) + if decoder_attention_mask is None: + decoder_attention_mask = tf.concat( + [ + tf.ones(decoder_input_ids[:, :1].shape, dtype=tf.int8), + tf.cast(tf.math.not_equal(decoder_input_ids[:, 1:], 
config.pad_token_id), tf.int8), + ], + axis=-1, + ) + if head_mask is None: + head_mask = tf.ones((config.encoder_layers, config.encoder_attention_heads)) + if decoder_head_mask is None: + decoder_head_mask = tf.ones((config.decoder_layers, config.decoder_attention_heads)) + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + } + + +@require_tf +class TFLEDModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (TFLEDForConditionalGeneration, TFLEDModel) if is_tf_available() else () + all_generative_model_classes = (TFLEDForConditionalGeneration,) if is_tf_available() else () + pipeline_model_mapping = ( + { + "feature-extraction": TFLEDModel, + "summarization": TFLEDForConditionalGeneration, + "text2text-generation": TFLEDForConditionalGeneration, + "translation": TFLEDForConditionalGeneration, + } + if is_tf_available() + else {} + ) + is_encoder_decoder = True + test_pruning = False + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFLEDModelTester(self) + self.config_tester = ConfigTester(self, config_class=LEDConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_decoder_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + inputs_dict["global_attention_mask"] = tf.zeros_like(inputs_dict["attention_mask"]) + num_global_attn_indices = 2 + inputs_dict["global_attention_mask"] = tf.where( + tf.range(self.model_tester.seq_length)[None, :] < num_global_attn_indices, + 1, + inputs_dict["global_attention_mask"], + ) + + config.return_dict = True + seq_length = self.model_tester.seq_length + encoder_seq_length = self.model_tester.encoder_seq_length + + def check_decoder_attentions_output(outputs): + decoder_attentions = outputs.decoder_attentions + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length, seq_length], + ) + + def check_encoder_attentions_output(outputs): + attentions = [t.numpy() for t in outputs.encoder_attentions] + global_attentions = [t.numpy() for t in outputs.encoder_global_attentions] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertEqual(len(global_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_length, seq_length], + ) + self.assertListEqual( + list(global_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, num_global_attn_indices], + ) + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["use_cache"] = False + config.output_hidden_states = False + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + out_len = len(outputs) + self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + if self.is_encoder_decoder: + model = model_class(config) + outputs = 
model(self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(config.output_hidden_states, False) + check_decoder_attentions_output(outputs) + + # Check that output attentions can also be changed via the config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + self.assertEqual(config.output_hidden_states, False) + check_encoder_attentions_output(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + config.output_hidden_states = True + model = model_class(config) + outputs = model(self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) + self.assertEqual(model.config.output_hidden_states, True) + check_encoder_attentions_output(outputs) + + @unittest.skip("LED keeps using potentially symbolic tensors in conditionals and breaks tracing.") + def test_saved_model_creation(self): + pass + + def test_generate_with_headmasking(self): + # TODO: Head-masking not yet implement + pass + + +def _long_tensor(tok_lst): + return tf.constant(tok_lst, dtype=tf.int32) + + +TOLERANCE = 1e-4 + + +@slow +@require_tf +class TFLEDModelIntegrationTest(unittest.TestCase): + def test_inference_no_head(self): + model = TFLEDForConditionalGeneration.from_pretrained("allenai/led-base-16384").led + + # change to intended input here + input_ids = _long_tensor([512 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + decoder_input_ids = _long_tensor([128 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + inputs_dict = prepare_led_inputs_dict(model.config, input_ids, decoder_input_ids) + output = model(**inputs_dict)[0] + expected_shape = (1, 1024, 768) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = tf.convert_to_tensor( + [[2.3050, 2.8279, 0.6531], [-1.8457, -0.1455, -3.5661], [-1.0186, 0.4586, -2.2043]], + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-3) + + def test_inference_with_head(self): + model = TFLEDForConditionalGeneration.from_pretrained("allenai/led-base-16384") + + # change to intended input here + input_ids = _long_tensor([512 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + decoder_input_ids = _long_tensor([128 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) + inputs_dict = prepare_led_inputs_dict(model.config, input_ids, decoder_input_ids) + output = model(**inputs_dict)[0] + expected_shape = (1, 1024, model.config.vocab_size) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = tf.convert_to_tensor( + [[33.6507, 6.4572, 16.8089], [5.8739, -2.4238, 11.2902], [-3.2139, -4.3149, 4.2783]], + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-3, rtol=1e-3) diff --git a/docs/transformers/tests/models/led/test_tokenization_led.py b/docs/transformers/tests/models/led/test_tokenization_led.py new file mode 100644 index 0000000000000000000000000000000000000000..a50acac048d0020b8aad19f3c57317bab28c6c04 --- /dev/null +++ b/docs/transformers/tests/models/led/test_tokenization_led.py @@ -0,0 +1,196 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import os +import unittest +from functools import lru_cache + +from transformers import BatchEncoding, LEDTokenizer, LEDTokenizerFast +from transformers.models.led.tokenization_led import VOCAB_FILES_NAMES +from transformers.testing_utils import require_tokenizers, require_torch +from transformers.utils import cached_property + +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible + + +@require_tokenizers +class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "allenai/led-base-16384" + tokenizer_class = LEDTokenizer + rust_tokenizer_class = LEDTokenizerFast + test_rust_tokenizer = True + + @classmethod + def setUpClass(cls): + super().setUpClass() + + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "<unk>", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] + cls.special_tokens_map = {"unk_token": "<unk>"} + + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(cls.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + def get_input_output_texts(self, tokenizer): + return "lower newer", "lower newer" + + @cached_property + def default_tokenizer(self): + return LEDTokenizer.from_pretrained("allenai/led-base-16384") + + @cached_property + def default_tokenizer_fast(self): + return LEDTokenizerFast.from_pretrained("allenai/led-base-16384") + + @require_torch + def test_prepare_batch(self): + src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."] + expected_src_tokens = [0, 250, 251, 17818, 13, 39186, 1938, 4, 2] + + for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]: + batch = tokenizer(src_text, max_length=len(expected_src_tokens), padding=True, return_tensors="pt") + self.assertIsInstance(batch, BatchEncoding) + + self.assertEqual((2, 9), batch.input_ids.shape) + self.assertEqual((2, 9), batch.attention_mask.shape) + result = batch.input_ids.tolist()[0] + self.assertListEqual(expected_src_tokens, result) + + @require_torch + def test_prepare_batch_empty_target_text(self): + src_text = ["A long 
paragraph for summarization.", "Another paragraph for summarization."] + for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]: + batch = tokenizer(src_text, padding=True, return_tensors="pt") + self.assertIn("input_ids", batch) + self.assertIn("attention_mask", batch) + self.assertNotIn("labels", batch) + self.assertNotIn("decoder_attention_mask", batch) + + @require_torch + def test_tokenizer_as_target_length(self): + tgt_text = [ + "Summary of the text.", + "Another summary.", + ] + for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]: + targets = tokenizer(text_target=tgt_text, max_length=32, padding="max_length", return_tensors="pt") + self.assertEqual(32, targets["input_ids"].shape[1]) + + @require_torch + def test_prepare_batch_not_longer_than_maxlen(self): + for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]: + batch = tokenizer( + ["I am a small frog" * 1024, "I am a small frog"], padding=True, truncation=True, return_tensors="pt" + ) + self.assertIsInstance(batch, BatchEncoding) + self.assertEqual(batch.input_ids.shape, (2, 5122)) + + @require_torch + def test_special_tokens(self): + src_text = ["A long paragraph for summarization."] + tgt_text = [ + "Summary of the text.", + ] + for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]: + inputs = tokenizer(src_text, return_tensors="pt") + targets = tokenizer(text_target=tgt_text, return_tensors="pt") + input_ids = inputs["input_ids"] + labels = targets["input_ids"] + self.assertTrue((input_ids[:, 0] == tokenizer.bos_token_id).all().item()) + self.assertTrue((labels[:, 0] == tokenizer.bos_token_id).all().item()) + self.assertTrue((input_ids[:, -1] == tokenizer.eos_token_id).all().item()) + self.assertTrue((labels[:, -1] == tokenizer.eos_token_id).all().item()) + + @require_torch + def test_global_attention_mask(self): + for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]: + src_text = ["Summary of the text.", "Another summary."] + expected_global_attention_mask = [[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, -1, -1]] + + encoded_output = tokenizer(src_text, padding=False) + encoded_output["global_attention_mask"] = [[0] * len(x) for x in encoded_output["input_ids"]] + outputs = tokenizer.pad(encoded_output) + self.assertSequenceEqual(outputs["global_attention_mask"], expected_global_attention_mask) + + @unittest.skip + def test_pretokenized_inputs(self): + pass + + def test_embeded_special_tokens(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + sentence = "A, <mask> AllenNLP sentence." 
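+ # The slow (Python) and fast (Rust) tokenizers are expected to agree on the encoded ids, token type ids, attention mask and decoded token strings for a sentence containing the <mask> special token.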
+ tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) + tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) + self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) + self.assertEqual( + sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]), + sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]), + ) + + tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) + tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) + self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) + self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) + + self.assertSequenceEqual( + tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"] + ) + self.assertSequenceEqual( + tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"] + ) diff --git a/docs/transformers/tests/models/levit/__init__.py b/docs/transformers/tests/models/levit/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/levit/test_image_processing_levit.py b/docs/transformers/tests/models/levit/test_image_processing_levit.py new file mode 100644 index 0000000000000000000000000000000000000000..beb3c77c1521453de17cf7f10f57e44c0359731f --- /dev/null +++ b/docs/transformers/tests/models/levit/test_image_processing_levit.py @@ -0,0 +1,122 @@ +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
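+# The LeViT image processing tests below check the processor's default attributes and that `from_dict` overrides of `size` / `crop_size` are applied as expected.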
+ + +import unittest + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from transformers import LevitImageProcessor + + if is_torchvision_available(): + from transformers import LevitImageProcessorFast + + +class LevitImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + size = size if size is not None else {"shortest_edge": 18} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "do_center_crop": self.do_center_crop, + "size": self.size, + "crop_size": self.crop_size, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.crop_size["height"], self.crop_size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class LevitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = LevitImageProcessor if is_vision_available() else None + fast_image_processing_class = LevitImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = LevitImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "size")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 18}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) + 
self.assertEqual(image_processor.size, {"shortest_edge": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) diff --git a/docs/transformers/tests/models/levit/test_modeling_levit.py b/docs/transformers/tests/models/levit/test_modeling_levit.py new file mode 100644 index 0000000000000000000000000000000000000000..f6226be1f871a22fc73baec1faad67aff86a6039 --- /dev/null +++ b/docs/transformers/tests/models/levit/test_modeling_levit.py @@ -0,0 +1,411 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch LeViT model.""" + +import unittest +import warnings +from math import ceil, floor + +from transformers import LevitConfig +from transformers.file_utils import cached_property, is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + LevitForImageClassification, + LevitForImageClassificationWithTeacher, + LevitModel, + ) + from transformers.models.auto.modeling_auto import ( + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, + MODEL_MAPPING_NAMES, + ) + + +if is_vision_available(): + from PIL import Image + + from transformers import LevitImageProcessor + + +class LevitConfigTester(ConfigTester): + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, "hidden_sizes")) + self.parent.assertTrue(hasattr(config, "num_attention_heads")) + + +class LevitModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=64, + num_channels=3, + kernel_size=3, + stride=2, + padding=1, + patch_size=16, + hidden_sizes=[16, 32, 48], + num_attention_heads=[1, 2, 3], + depths=[2, 3, 4], + key_dim=[8, 8, 8], + drop_path_rate=0, + mlp_ratio=[2, 2, 2], + attention_ratio=[2, 2, 2], + initializer_range=0.02, + is_training=True, + use_labels=True, + num_labels=2, # Check + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.hidden_sizes = hidden_sizes + self.num_attention_heads = num_attention_heads + self.depths = depths + self.key_dim = key_dim + self.drop_path_rate = drop_path_rate + self.patch_size = patch_size + self.attention_ratio = attention_ratio + self.mlp_ratio = mlp_ratio + self.initializer_range = initializer_range + self.down_ops = [ + ["Subsample", key_dim[0], hidden_sizes[0] // key_dim[0], 4, 2, 2], + ["Subsample", key_dim[0], hidden_sizes[1] // key_dim[0], 4, 2, 2], + ] + self.is_training = is_training + self.use_labels = use_labels + self.num_labels = num_labels + self.initializer_range = 
initializer_range + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.num_labels) + + config = self.get_config() + return config, pixel_values, labels + + def get_config(self): + return LevitConfig( + image_size=self.image_size, + num_channels=self.num_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + patch_size=self.patch_size, + hidden_sizes=self.hidden_sizes, + num_attention_heads=self.num_attention_heads, + depths=self.depths, + key_dim=self.key_dim, + drop_path_rate=self.drop_path_rate, + mlp_ratio=self.mlp_ratio, + attention_ratio=self.attention_ratio, + initializer_range=self.initializer_range, + down_ops=self.down_ops, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = LevitModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + image_size = (self.image_size, self.image_size) + height, width = image_size[0], image_size[1] + for _ in range(4): + height = floor(((height + 2 * self.padding - self.kernel_size) / self.stride) + 1) + width = floor(((width + 2 * self.padding - self.kernel_size) / self.stride) + 1) + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.batch_size, ceil(height / 4) * ceil(width / 4), self.hidden_sizes[-1]), + ) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = LevitForImageClassification(config) + model.to(torch_device) + model.eval() + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class LevitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as Levit does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = ( + (LevitModel, LevitForImageClassification, LevitForImageClassificationWithTeacher) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "image-feature-extraction": LevitModel, + "image-classification": (LevitForImageClassification, LevitForImageClassificationWithTeacher), + } + if is_torch_available() + else {} + ) + + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + test_head_masking = False + has_attentions = False + + def setUp(self): + self.model_tester = LevitModelTester(self) + self.config_tester = ConfigTester( + self, config_class=LevitConfig, has_text_modality=False, common_properties=["image_size", "num_channels"] + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="Levit does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Levit does not support input and output embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="Levit does not output attentions") + def test_attention_outputs(self): + pass + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = len(self.model_tester.depths) + 1 + self.assertEqual(len(hidden_states), expected_num_layers) + + image_size = (self.model_tester.image_size, self.model_tester.image_size) + height, width = image_size[0], image_size[1] + for _ in range(4): + height = floor( + ( + (height + 2 * self.model_tester.padding - self.model_tester.kernel_size) + / self.model_tester.stride + ) + + 1 + ) + width = floor( + ( + (width + 2 * self.model_tester.padding - self.model_tester.kernel_size) + / self.model_tester.stride + ) + + 1 + ) + # verify the first hidden states (first block) + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [ + height * width, + self.model_tester.hidden_sizes[0], + ], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "LevitForImageClassificationWithTeacher": + del inputs_dict["labels"] + + return inputs_dict + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_image_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + + # special case for LevitForImageClassificationWithTeacher model + def test_training(self): + if not self.model_tester.is_training: + self.skipTest(reason="model_tester.is_training is set to False") + + config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + # LevitForImageClassificationWithTeacher supports inference-only + if ( + model_class.__name__ in MODEL_MAPPING_NAMES.values() + or model_class.__name__ == "LevitForImageClassificationWithTeacher" + ): + continue + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + if not self.model_tester.is_training: + self.skipTest(reason="model_tester.is_training is set to False") + + config.use_cache = False + config.return_dict = True + + for model_class in self.all_model_classes: + if model_class.__name__ in MODEL_MAPPING_NAMES.values() or not model_class.supports_gradient_checkpointing: + continue + # LevitForImageClassificationWithTeacher supports inference-only + if model_class.__name__ == "LevitForImageClassificationWithTeacher": + continue + model = model_class(config) + model.gradient_checkpointing_enable() + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_problem_types(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + problem_types = [ + {"title": "multi_label_classification", "num_labels": 2, "dtype": torch.float}, + {"title": "single_label_classification", "num_labels": 1, "dtype": torch.long}, + {"title": "regression", "num_labels": 1, "dtype": torch.float}, + ] + + for model_class in self.all_model_classes: + if ( + model_class.__name__ + not in [ + *MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES.values(), + ] + or model_class.__name__ == "LevitForImageClassificationWithTeacher" + ): + continue + + for problem_type in problem_types: + with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): + config.problem_type = problem_type["title"] + config.num_labels = problem_type["num_labels"] + + model = model_class(config) + model.to(torch_device) + model.train() + + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + + if problem_type["num_labels"] > 1: + inputs["labels"] = inputs["labels"].unsqueeze(1).repeat(1, problem_type["num_labels"]) + + inputs["labels"] = inputs["labels"].to(problem_type["dtype"]) + + # This tests that we do not trigger the warning form PyTorch "Using a target size that is different + # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure + # they have the same size." which is a symptom something in wrong for the regression problem. 
+ # See https://github.com/huggingface/transformers/issues/11780 + with warnings.catch_warnings(record=True) as warning_list: + loss = model(**inputs).loss + for w in warning_list: + if "Using a target size that is different to the input size" in str(w.message): + raise ValueError( + f"Something is going wrong in the regression problem: intercepted {w.message}" + ) + + loss.backward() + + @slow + def test_model_from_pretrained(self): + model_name = "facebook/levit-128S" + model = LevitModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class LevitModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return LevitImageProcessor.from_pretrained("facebook/levit-128S") + + @slow + def test_inference_image_classification_head(self): + model = LevitForImageClassificationWithTeacher.from_pretrained("facebook/levit-128S").to(torch_device) + + image_processor = self.default_image_processor + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([1.0448, -0.3745, -1.8317]).to(torch_device) + + torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4) diff --git a/docs/transformers/tests/models/lilt/__init__.py b/docs/transformers/tests/models/lilt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/lilt/test_modeling_lilt.py b/docs/transformers/tests/models/lilt/test_modeling_lilt.py new file mode 100644 index 0000000000000000000000000000000000000000..949649a503dfdef94d557305d47200acf11d13dd --- /dev/null +++ b/docs/transformers/tests/models/lilt/test_modeling_lilt.py @@ -0,0 +1,327 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
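+# Testing suite for the PyTorch LiLT model.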
+ + +import unittest + +from transformers import LiltConfig, is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + LiltForQuestionAnswering, + LiltForSequenceClassification, + LiltForTokenClassification, + LiltModel, + ) + + +class LiltModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=24, + num_hidden_layers=2, + num_attention_heads=6, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + scope=None, + range_bbox=1000, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.scope = scope + self.range_bbox = range_bbox + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox) + # Ensure that bbox is legal + for i in range(bbox.shape[0]): + for j in range(bbox.shape[1]): + if bbox[i, j, 3] < bbox[i, j, 1]: + t = bbox[i, j, 3] + bbox[i, j, 3] = bbox[i, j, 1] + bbox[i, j, 1] = t + if bbox[i, j, 2] < bbox[i, j, 0]: + t = bbox[i, j, 2] + bbox[i, j, 2] = bbox[i, j, 0] + bbox[i, j, 0] = t + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + + config = self.get_config() + + return config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels + + def get_config(self): + return LiltConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + 
initializer_range=self.initializer_range, + ) + + def create_and_check_model( + self, + config, + input_ids, + bbox, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + ): + model = LiltModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, bbox=bbox, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, bbox=bbox, token_type_ids=token_type_ids) + result = model(input_ids, bbox=bbox) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_token_classification( + self, + config, + input_ids, + bbox, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + ): + config.num_labels = self.num_labels + model = LiltForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, bbox=bbox, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_question_answering( + self, + config, + input_ids, + bbox, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + ): + model = LiltForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + bbox=bbox, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + bbox, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "bbox": bbox, + "token_type_ids": token_type_ids, + "attention_mask": input_mask, + } + return config, inputs_dict + + +@require_torch +class LiltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + LiltModel, + LiltForSequenceClassification, + LiltForTokenClassification, + LiltForQuestionAnswering, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": LiltModel, + "question-answering": LiltForQuestionAnswering, + "text-classification": LiltForSequenceClassification, + "token-classification": LiltForTokenClassification, + "zero-shot": LiltForSequenceClassification, + } + if is_torch_available() + else {} + ) + fx_compatible = False + test_pruning = False + + # TODO: Fix the failed tests + def is_pipeline_test_to_skip( + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, + ): + return True + + def setUp(self): + self.model_tester = LiltModelTester(self) + self.config_tester = ConfigTester(self, config_class=LiltConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = 
self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "SCUT-DLVCLab/lilt-roberta-en-base" + model = LiltModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +@slow +class LiltModelIntegrationTest(unittest.TestCase): + def test_inference_no_head(self): + model = LiltModel.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base").to(torch_device) + + input_ids = torch.tensor([[1, 2]], device=torch_device) + bbox = torch.tensor([[[1, 2, 3, 4], [5, 6, 7, 8]]], device=torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(input_ids=input_ids, bbox=bbox) + + expected_shape = torch.Size([1, 2, 768]) + expected_slice = torch.tensor( + [[-0.0653, 0.0950, -0.0061], [-0.0545, 0.0926, -0.0324]], + device=torch_device, + ) + + self.assertTrue(outputs.last_hidden_state.shape, expected_shape) + torch.testing.assert_close(outputs.last_hidden_state[0, :, :3], expected_slice, rtol=1e-3, atol=1e-3) diff --git a/docs/transformers/tests/models/llama/__init__.py b/docs/transformers/tests/models/llama/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/llama/test_modeling_flax_llama.py b/docs/transformers/tests/models/llama/test_modeling_flax_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..7091dadf5826f7683d02b86ca04de38f5c95cd15 --- /dev/null +++ b/docs/transformers/tests/models/llama/test_modeling_flax_llama.py @@ -0,0 +1,259 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
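+# Testing suite for the Flax LLaMA model.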
+ + +import unittest + +import numpy as np + +from transformers import LlamaConfig, is_flax_available, is_tokenizers_available +from transformers.testing_utils import require_flax, slow + +from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor + + +if is_flax_available(): + import jax.numpy as jnp + + from transformers.models.llama.modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel + + +if is_tokenizers_available(): + from transformers import LlamaTokenizerFast + + +class FlaxLlamaModelTester: + def __init__( + self, + parent, + batch_size=2, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=2, + intermediate_size=64, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + window_size=7, + initializer_range=0.02, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.window_size = window_size + self.initializer_range = initializer_range + self.scope = None + self.bos_token_id = vocab_size - 1 + self.eos_token_id = vocab_size - 1 + self.pad_token_id = vocab_size - 1 + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = np.tril(np.ones((self.batch_size, self.seq_length))) + + config = LlamaConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + use_cache=True, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return (config, input_ids, input_mask) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + return config, inputs_dict + + def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask): + max_decoder_length = 20 + model = model_class_name(config) + + past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length) + attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4") + + position_ids = jnp.broadcast_to( + jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1) + ) + outputs_cache = model( + input_ids[:, :-1], + attention_mask=attention_mask, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4") + 
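+        # decode only the final token on top of the cache filled above, then compare it against an uncached full forward pass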
outputs_cache_next = model( + input_ids[:, -1:], + attention_mask=attention_mask, + past_key_values=outputs_cache.past_key_values, + position_ids=position_ids, + ) + + outputs = model(input_ids) + + diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])) + self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}") + + def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask): + max_decoder_length = 20 + model = model_class_name(config) + + attention_mask_cache = jnp.concatenate( + [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))], + axis=-1, + ) + + past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length) + position_ids = jnp.broadcast_to( + jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1) + ) + + outputs_cache = model( + input_ids[:, :-1], + attention_mask=attention_mask_cache, + past_key_values=past_key_values, + position_ids=position_ids, + ) + position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4") + outputs_cache_next = model( + input_ids[:, -1:], + past_key_values=outputs_cache.past_key_values, + attention_mask=attention_mask_cache, + position_ids=position_ids, + ) + + outputs = model(input_ids, attention_mask=attention_mask) + + diff = np.max(np.abs(outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])) + self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}") + + +@require_flax +class FlaxLlamaModelTest(FlaxModelTesterMixin, unittest.TestCase): + all_model_classes = (FlaxLlamaModel, FlaxLlamaForCausalLM) if is_flax_available() else () + + def setUp(self): + self.model_tester = FlaxLlamaModelTester(self) + + def test_use_cache_forward(self): + for model_class_name in self.all_model_classes: + config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask) + + def test_use_cache_forward_with_attn_mask(self): + for model_class_name in self.all_model_classes: + config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_use_cache_forward_with_attn_mask( + model_class_name, config, input_ids, attention_mask + ) + + @slow + def test_model_from_pretrained(self): + for model_class_name in self.all_model_classes: + model = model_class_name.from_pretrained("openlm-research/open_llama_3b_v2", from_pt=True) + outputs = model(np.ones((1, 1))) + self.assertIsNotNone(outputs) + + +@slow +@require_flax +class FlaxLlamaIntegrationTest(unittest.TestCase): + def setUp(self): + self.model_id = "openlm-research/open_llama_3b_v2" + self.model = FlaxLlamaForCausalLM.from_pretrained(self.model_id, from_pt=True) + self.test_batch = jnp.arange(32).reshape(4, 8) + 1911 + + def test_model_logits(self): + flax_logits = self.model(self.test_batch).logits + + # fmt: off + EXPECTED_LOGITS = [-74.4243, -74.0680, -65.2507, -79.1658, -77.7460, -69.2379, -86.4588, -84.8933, -77.8456] + EXPECTED_MIN, EXPECTED_MAX, EXPECTED_MEAN = -96.9952 + EXPECTED_MAX = -18.4571 + EXPECTED_MEAN = -65.0608 + # fmt: on + + self.assertTrue(np.allclose(flax_logits[0, :3, :3].flatten(), EXPECTED_LOGITS, atol=1e-4)) + self.assertAlmostEqual(flax_logits.min(), EXPECTED_MIN, places=3) + self.assertAlmostEqual(flax_logits.max(), EXPECTED_MAX, places=3) + self.assertAlmostEqual(flax_logits.mean(), EXPECTED_MEAN, places=3) + + def test_model_hidden_states(self): 
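+        # the mean of every returned hidden state is checked against precomputed reference values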
+ flax_hidden_states = self.model(self.test_batch, output_hidden_states=True).hidden_states + flax_hidden_means = [h.mean() for h in flax_hidden_states] + + # fmt: off + EXPECTED_HIDDEN_MEANS = [ + -0.00007,-0.00049,-0.00169,-0.00253,-0.00271, + -0.00290,-0.00252,0.00230,0.00230,0.00198, + 0.00196,0.00174,0.00246,0.00205,0.00242, + 0.00171,0.00092,0.00054,0.00102,0.00024, + 0.00029,0.00037,-0.00101,-0.00062,-0.00341,-0.00636,-0.00357 + ] + # fmt: on + + self.assertTrue(np.allclose(flax_hidden_means, EXPECTED_HIDDEN_MEANS, atol=1e-4)) + + def test_generated_text(self): + tokenizer = LlamaTokenizerFast.from_pretrained(self.model_id) + tokenizer.pad_token_id = 2 + test_batch = ["Aloha, World! ", "2 + 2 = ", "Paris is the capital of ", "我很高興認識"] + + inputs = tokenizer(test_batch, return_tensors="np", truncation=True, padding=True) + generated_ids = self.model.generate(**inputs, max_length=15).sequences + generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + + # fmt: off + EXPECTED_GENERATION = [ + "Aloha, World! 201", + "2 + 2 = 4\n2", + "Paris is the capital of Île-", + "我很高興認識你,我" + ] + # fmt: on + + self.assertListEqual(generated_text, EXPECTED_GENERATION) diff --git a/docs/transformers/tests/models/llama/test_modeling_llama.py b/docs/transformers/tests/models/llama/test_modeling_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..c9b86c128cae799247fa9798332c2435fc2909fe --- /dev/null +++ b/docs/transformers/tests/models/llama/test_modeling_llama.py @@ -0,0 +1,942 @@ +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch LLaMA model.""" + +import unittest + +from packaging import version +from parameterized import parameterized + +from transformers import AutoTokenizer, LlamaConfig, StaticCache, is_torch_available, set_seed +from transformers.generation.configuration_utils import GenerationConfig +from transformers.testing_utils import ( + Expectations, + cleanup, + require_read_token, + require_torch, + require_torch_accelerator, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + LlamaForCausalLM, + LlamaForQuestionAnswering, + LlamaForSequenceClassification, + LlamaForTokenClassification, + LlamaModel, + LlamaTokenizer, + ) + from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding + + +class LlamaModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + pad_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return LlamaConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + 
intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + pad_token_id=self.pad_token_id, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = LlamaModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + LlamaModel, + LlamaForCausalLM, + LlamaForSequenceClassification, + LlamaForQuestionAnswering, + LlamaForTokenClassification, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": LlamaModel, + "text-classification": LlamaForSequenceClassification, + "text-generation": LlamaForCausalLM, + "zero-shot": LlamaForSequenceClassification, + "question-answering": LlamaForQuestionAnswering, + "token-classification": LlamaForTokenClassification, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + fx_compatible = False # Broken by attention refactor cc @Cyrilvallez + + # Need to use `0.8` instead of `0.9` for `test_cpu_offload` + # This is because we are hitting edge cases with the causal_mask buffer + model_split_percents = [0.5, 0.7, 0.8] + + # used in `test_torch_compile_for_training` + _torch_compile_train_cls = LlamaForCausalLM if is_torch_available() else None + + def setUp(self): + self.model_tester = LlamaModelTester(self) + self.config_tester = ConfigTester(self, config_class=LlamaConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_llama_sequence_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = LlamaForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + 
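+    # The two variants below pin config.problem_type so the single-label (cross-entropy) and
+    # multi-label (BCE-with-logits) loss branches of the sequence-classification head are exercised explicitly.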
def test_llama_sequence_classification_model_for_single_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "single_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) + model = LlamaForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_llama_sequence_classification_model_for_multi_label(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + config.problem_type = "multi_label_classification" + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + sequence_labels = ids_tensor( + [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size + ).to(torch.float) + model = LlamaForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) + self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + + def test_llama_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) + model = LlamaForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), + ) + + @parameterized.expand([("linear",), ("dynamic",), ("yarn",)]) + def test_model_rope_scaling_from_config(self, scaling_type): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + short_input = ids_tensor([1, 10], config.vocab_size) + long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + original_model = LlamaModel(config) + original_model.to(torch_device) + original_model.eval() + original_short_output = original_model(short_input).last_hidden_state + original_long_output = original_model(long_input).last_hidden_state + + set_seed(42) # Fixed seed at init time so the two models get the same random weights + config.rope_scaling = {"type": scaling_type, "factor": 10.0} + scaled_model = LlamaModel(config) + scaled_model.to(torch_device) + scaled_model.eval() + scaled_short_output = scaled_model(short_input).last_hidden_state + scaled_long_output = scaled_model(long_input).last_hidden_state + + # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original + # maximum sequence length, so the outputs for the short input should match. 
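+        # (linear and yarn rescale every position, so even the short-input outputs are expected to differ)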
+ if scaling_type == "dynamic": + torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) + else: + self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) + + # The output should be different for long inputs + self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) + + def test_model_rope_scaling(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + scaling_factor = 10 + short_input_length = 10 + long_input_length = int(config.max_position_embeddings * 1.5) + + # Inputs + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device + position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) + position_ids_short = position_ids_short.unsqueeze(0) + position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) + position_ids_long = position_ids_long.unsqueeze(0) + + # Sanity check original RoPE + original_rope = LlamaRotaryEmbedding(config=config).to(torch_device) + original_cos_short, original_sin_short = original_rope(x, position_ids_short) + original_cos_long, original_sin_long = original_rope(x, position_ids_long) + torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) + + # Sanity check linear RoPE scaling + # New position "x" should match original position with index "x/scaling_factor" + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + linear_scaling_rope = LlamaRotaryEmbedding(config=config).to(torch_device) + linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) + linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) + torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) + for new_position in range(0, long_input_length, scaling_factor): + original_position = int(new_position // scaling_factor) + torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) + torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) + + # Sanity check Dynamic NTK RoPE scaling + # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase + # with scaling_factor (or that `inv_freq` decreases) + config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} + ntk_scaling_rope = LlamaRotaryEmbedding(config=config).to(torch_device) + ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) + ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) + torch.testing.assert_close(ntk_cos_short, original_cos_short) + torch.testing.assert_close(ntk_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_sin_long, original_sin_long) + self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) + + # Sanity check Yarn RoPE scaling + # Scaling should be over the entire input + config.rope_scaling = {"type": "yarn", "factor": scaling_factor} + yarn_scaling_rope = LlamaRotaryEmbedding(config=config).to(torch_device) + yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) + yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) + torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_short, original_cos_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_long, original_sin_long) + + def test_model_loading_old_rope_configs(self): + def _reinitialize_config(base_config, new_kwargs): + # Reinitialize the config with the new kwargs, forcing the config to go through its __init__ validation + # steps. 
+ base_config_dict = base_config.to_dict() + new_config = LlamaConfig.from_dict(config_dict={**base_config_dict, **new_kwargs}) + return new_config + + # from untouched config -> ✅ + base_config, model_inputs = self.model_tester.prepare_config_and_inputs_for_common() + original_model = LlamaForCausalLM(base_config).to(torch_device) + original_model(**model_inputs) + + # from a config with the expected rope configuration -> ✅ + config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear", "factor": 10.0}}) + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + + # from a config with the old rope configuration ('type' instead of 'rope_type') -> ✅ we gracefully handle BC + config = _reinitialize_config(base_config, {"rope_scaling": {"type": "linear", "factor": 10.0}}) + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + + # from a config with both 'type' and 'rope_type' -> ✅ they can coexist (and both are present in the config) + config = _reinitialize_config( + base_config, {"rope_scaling": {"type": "linear", "rope_type": "linear", "factor": 10.0}} + ) + self.assertTrue(config.rope_scaling["type"] == "linear") + self.assertTrue(config.rope_scaling["rope_type"] == "linear") + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + + # from a config with parameters in a bad range ('factor' should be >= 1.0) -> ⚠️ throws a warning + with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: + config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear", "factor": -999.0}}) + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + self.assertEqual(len(logs.output), 1) + self.assertIn("factor field", logs.output[0]) + + # from a config with unknown parameters ('foo' isn't a rope option) -> ⚠️ throws a warning + with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: + config = _reinitialize_config( + base_config, {"rope_scaling": {"rope_type": "linear", "factor": 10.0, "foo": "bar"}} + ) + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + self.assertEqual(len(logs.output), 1) + self.assertIn("Unrecognized keys", logs.output[0]) + + # from a config with specific rope type but missing one of its mandatory parameters -> ❌ throws exception + with self.assertRaises(KeyError): + config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}}) # missing "factor" + + +@require_torch_accelerator +class LlamaIntegrationTest(unittest.TestCase): + def tearDown(self): + # TODO (joao): automatic compilation, i.e. compilation when `cache_implementation="static"` is used, leaves + # some memory allocated in the cache, which means some object is not being released properly. This causes some + # unoptimal memory usage, e.g. after certain tests a 7B model in FP16 no longer fits in a 24GB GPU. + # Investigate the root cause. + cleanup(torch_device, gc_collect=False) + + @slow + @require_read_token + def test_llama_3_1_hard(self): + """ + An integration test for llama 3.1. It tests against a long output to ensure the subtle numerical differences + from llama 3.1.'s RoPE can be detected + """ + # diff on `EXPECTED_TEXT`: + # 2024-08-26: updating from torch 2.3.1 to 2.4.0 slightly changes the results. + EXPECTED_TEXT = ( + "Tell me about the french revolution. 
The french revolution was a period of radical political and social " + "upheaval in France that lasted from 1789 until 1799. It was a time of great change and upheaval, marked " + "by the overthrow of the monarchy, the rise of the middle class, and the eventual establishment of the " + "First French Republic.\nThe revolution began in 1789 with the Estates-General, a representative " + "assembly that had not met since 1614. The Third Estate, which represented the common people, " + "demanded greater representation and eventually broke away to form the National Assembly. This marked " + "the beginning of the end of the absolute monarchy and the rise of the middle class.\n" + ) + + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct") + model = LlamaForCausalLM.from_pretrained( + "meta-llama/Meta-Llama-3.1-8B-Instruct", device_map="auto", torch_dtype=torch.bfloat16 + ) + input_text = ["Tell me about the french revolution."] + model_inputs = tokenizer(input_text, return_tensors="pt").to(model.device) + + generated_ids = model.generate(**model_inputs, max_new_tokens=128, do_sample=False) + generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) + self.assertEqual(generated_text, EXPECTED_TEXT) + + @slow + @require_read_token + def test_model_7b_logits_bf16(self): + input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] + + model = LlamaForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="eager" + ) + + with torch.no_grad(): + out = model(torch.tensor([input_ids]).to(torch_device)) + # Expected mean on dim = -1 + + # fmt: off + expected_means = Expectations( + { + ("xpu", 3): torch.tensor([[-6.5208, -4.1218, -4.9377, -3.2536, 0.8127, -2.9811, 1.2918, -3.3848]]), + ("cuda", 7): torch.tensor([[-6.5061, -4.1147, -4.9669, -3.2038, 0.8069, -2.9694, 1.2864, -3.3786]]), + ("cuda", 8): torch.tensor([[-6.5208, -4.1218, -4.9377, -3.2536, 0.8127, -2.9811, 1.2918, -3.3848]]) + }) + + expected_mean = expected_means.get_expectation() + self.assertTrue( + torch.allclose( + expected_mean.to(torch_device), + out.logits.float().mean(-1), + atol=1e-2, + rtol=1e-2 + ) + ) + + # slicing logits[0, 0, 0:15] + expected_slices = Expectations( + { + ("xpu", 3): torch.tensor([[-12.5625, -7.1250, -0.6289, -7.8750, -6.9688, -7.8125, -6.5000, -7.4375, -7.6562, -6.9688, -6.0312, -7.0312, -1.8203, 1.8750, -8.5000]]), + ("cuda", 7): torch.tensor([[-12.5000, -7.0625, -0.6289, -7.8750, -6.9688, -7.8125, -6.4688, -7.4375, -7.6875, -6.9375, -6.0312, -7.0000, -1.8594, 1.8438, -8.5000]]), + ("cuda", 8): torch.tensor([[-12.5625, -7.1250, -0.6289, -7.8750, -6.9688, -7.8125, -6.5000, -7.4375, -7.6562, -6.9688, -6.0312, -7.0312, -1.8203, 1.8750, -8.5000]]) + }) + # fmt: on + expected_slice = expected_slices.get_expectation() + self.assertTrue( + torch.allclose( + expected_slice.to(torch_device), + out.logits[0, 0, :15].float(), + atol=1e-2, + rtol=1e-2, + ) + ) + + @slow + @require_read_token + def test_model_7b_logits(self): + input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] + + model = LlamaForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.float16 + ) + + with torch.no_grad(): + out = model(torch.tensor([input_ids]).to(torch_device)) + + # fmt: off + # Expected mean on dim = -1 + expected_means = Expectations( + { + ("xpu", 3): torch.tensor([[-6.6544, -4.1259, -4.9840, -3.2456, 0.8261, -3.0124, 1.2971, -3.3641]]), + ("cuda", 7): torch.tensor([[-6.6420, -4.1227, 
-4.9809, -3.2041, 0.8261, -3.0052, 1.2957, -3.3648]]), + ("cuda", 8): torch.tensor([[-6.6544, -4.1259, -4.9840, -3.2456, 0.8261, -3.0124, 1.2971, -3.3641]]), + }) + + expected_mean = expected_means.get_expectation() + self.assertTrue( + torch.allclose( + expected_mean.to(torch_device), + out.logits.float().mean(-1), + atol=1e-2, + rtol=1e-2 + ) + ) + + # slicing logits[0, 0, 0:15] + expected_slices = Expectations( + { + ("xpu", 3): torch.tensor([-12.8281, -7.4609, -0.4668, -8.0703, -7.2539, -8.0078, -6.4961, -7.7734, -7.8516, -7.0352, -6.2188, -7.1367, -1.8564, 1.9922, -8.6328]), + ("cuda", 7): torch.tensor([-12.8125, -7.3359, -0.4846, -8.0234, -7.2383, -7.9922, -6.4805, -7.7344, -7.8125, -7.0078, -6.1797, -7.1094, -1.8633, 1.9736, -8.6016]), + ("cuda", 8): torch.tensor([-12.8281, -7.4609, -0.4668, -8.0703, -7.2539, -8.0078, -6.4961, -7.7734, -7.8516, -7.0352, -6.2188, -7.1367, -1.8564, 1.9922, -8.6328]) + }) + # fmt: on + + expected_slice = expected_slices.get_expectation() + self.assertTrue( + torch.allclose( + expected_slice.to(torch_device), + out.logits[0, 0, :15].float(), + atol=1e-2, + rtol=1e-2, + ) + ) + + @slow + def test_model_7b_dola_generation(self): + # ground truth text generated with dola_layers="low", repetition_penalty=1.2 + EXPECTED_TEXT_COMPLETION = ( + "Simply put, the theory of relativity states that 1) time and space are relative, and 2) the laws of " + "physics are the same for all observers in uniform motion relative to one another.\n\nThe theory of " + "relativity was developed by Albert Einstein in the early 20th century, and it revolutionized our " + "understanding of space and time." + ) + prompt = "Simply put, the theory of relativity states that " + tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") + model = LlamaForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-chat-hf", device_map="sequential", torch_dtype=torch.float16 + ) + model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + + # greedy generation outputs + generated_ids = model.generate( + **model_inputs, max_new_tokens=64, top_p=None, temperature=1, do_sample=False, dola_layers="low" + ) + text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, text) + + @slow + @require_torch_accelerator + @require_read_token + def test_compile_static_cache(self): + # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2 + # work as intended. See https://github.com/pytorch/pytorch/issues/121943 + if version.parse(torch.__version__) < version.parse("2.3.0"): + self.skipTest(reason="This test requires torch >= 2.3 to run.") + + NUM_TOKENS_TO_GENERATE = 40 + # Note on `EXPECTED_TEXT_COMPLETION`'s diff: the current value matches the original test if the original test + # was changed to have a cache of 53 tokens (as opposed to 4096), on Ampere GPUs. + EXPECTED_TEXT_COMPLETION = [ + "Simply put, the theory of relativity states that 1) the speed of light is constant in all inertial " + "reference frames, and 2) the laws of physics are the same for all inertial reference frames.\nThe " + "theory of relativ", + "My favorite all time favorite condiment is ketchup. I love it on everything. 
I love it on my eggs, " + "my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p", + ] + + prompts = [ + "Simply put, the theory of relativity states that ", + "My favorite all time favorite condiment is ketchup.", + ] + tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", pad_token="", padding_side="right") + model = LlamaForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", device_map=torch_device, torch_dtype=torch.float16 + ) + inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + + # Dynamic Cache + generated_ids = model.generate(**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False) + dynamic_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, dynamic_text) + + # Static Cache + compile (`generate()` internally compiles each decoding step when static cache is used) + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" + ) + static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, static_text) + + @slow + @require_read_token + def test_export_static_cache(self): + if version.parse(torch.__version__) < version.parse("2.4.0"): + self.skipTest(reason="This test requires torch >= 2.4 to run.") + + from transformers.integrations.executorch import ( + TorchExportableModuleWithStaticCache, + convert_and_export_with_cache, + ) + + llama_models = { + "meta-llama/Llama-3.2-1B": [ + "Simply put, the theory of relativity states that 1) the speed of light is the same for all " + "observers, regardless of their location, and 2) the laws of physics are the same for all observers" + ], + } + + for llama_model_ckp, EXPECTED_TEXT_COMPLETION in llama_models.items(): + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(llama_model_ckp, pad_token="", padding_side="right") + max_generation_length = tokenizer(EXPECTED_TEXT_COMPLETION, return_tensors="pt", padding=True)[ + "input_ids" + ].shape[-1] + + # Load model + device = "cpu" + dtype = torch.bfloat16 + cache_implementation = "static" + attn_implementation = "sdpa" + batch_size = 1 + model = LlamaForCausalLM.from_pretrained( + llama_model_ckp, + device_map=device, + torch_dtype=dtype, + attn_implementation=attn_implementation, + generation_config=GenerationConfig( + use_cache=True, + cache_implementation=cache_implementation, + max_length=max_generation_length, + cache_config={ + "batch_size": batch_size, + "max_cache_len": max_generation_length, + "device": device, + }, + ), + ) + + prompts = ["Simply put, the theory of relativity states that "] + prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + prompt_token_ids = prompt_tokens["input_ids"] + max_new_tokens = max_generation_length - prompt_token_ids.shape[-1] + + # Static Cache + export + exported_program = convert_and_export_with_cache(model) + ep_generated_ids = TorchExportableModuleWithStaticCache.generate( + exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens + ) + ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text) + + +@slow +@require_torch_accelerator +class Mask4DTestHard(unittest.TestCase): + def tearDown(self): + cleanup(torch_device, gc_collect=True) + + def setUp(self): + cleanup(torch_device, gc_collect=True) 
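+        # free accelerator memory left over from previous tests before loading the fp32 checkpoint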
+ model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + self.model_dtype = torch.float32 + self.tokenizer = LlamaTokenizer.from_pretrained(model_name) + self.model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=self.model_dtype).to(torch_device) + + def get_test_data(self): + template = "my favorite {}" + items = ("pet is a", "artist plays a", "name is L") # same number of tokens in each item + + batch_separate = [template.format(x) for x in items] # 3 separate lines + batch_shared_prefix = template.format(" ".join(items)) # 1 line with options concatenated + + input_ids = self.tokenizer(batch_separate, return_tensors="pt").input_ids.to(torch_device) + input_ids_shared_prefix = self.tokenizer(batch_shared_prefix, return_tensors="pt").input_ids.to(torch_device) + + mask_shared_prefix = torch.tensor( + [ + [ + [ + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0], + [1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0], + [1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1], + ] + ] + ], + device=torch_device, + ) + + position_ids = torch.arange(input_ids.shape[1]).tile(input_ids.shape[0], 1).to(torch_device) + + # building custom positions ids based on custom mask + position_ids_shared_prefix = (mask_shared_prefix.sum(dim=-1) - 1).reshape(1, -1) + # effectively: position_ids_shared_prefix = torch.tensor([[0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5]]).to(device) + + # inverting the mask + min_dtype = torch.finfo(self.model_dtype).min + mask_shared_prefix = (mask_shared_prefix.eq(0.0)).to(dtype=self.model_dtype) * min_dtype + + return input_ids, position_ids, input_ids_shared_prefix, mask_shared_prefix, position_ids_shared_prefix + + def test_stacked_causal_mask(self): + ( + input_ids, + position_ids, + input_ids_shared_prefix, + mask_shared_prefix, + position_ids_shared_prefix, + ) = self.get_test_data() + + # regular batch + logits = self.model.forward(input_ids, position_ids=position_ids).logits + logits_last = logits[:, -1, :] # last tokens in each batch line + decoded = [self.tokenizer.decode(t) for t in logits_last.argmax(dim=-1)] + + # single forward run with 4D custom mask + logits_shared_prefix = self.model.forward( + input_ids_shared_prefix, attention_mask=mask_shared_prefix, position_ids=position_ids_shared_prefix + ).logits + logits_shared_prefix_last = logits_shared_prefix[ + 0, torch.where(position_ids_shared_prefix == position_ids_shared_prefix.max())[1], : + ] # last three tokens + decoded_shared_prefix = [self.tokenizer.decode(t) for t in logits_shared_prefix_last.argmax(dim=-1)] + + self.assertEqual(decoded, decoded_shared_prefix) + + def test_partial_stacked_causal_mask(self): + # Same as the test above, but the input is passed in two groups. 
It tests that we can pass partial 4D attention masks + + ( + input_ids, + position_ids, + input_ids_shared_prefix, + mask_shared_prefix, + position_ids_shared_prefix, + ) = self.get_test_data() + + # regular batch + logits = self.model.forward(input_ids, position_ids=position_ids).logits + logits_last = logits[:, -1, :] # last tokens in each batch line + decoded = [self.tokenizer.decode(t) for t in logits_last.argmax(dim=-1)] + + # 2 forward runs with custom 4D masks + part_a = 3 # split point + + input_1a = input_ids_shared_prefix[:, :part_a] + position_ids_1a = position_ids_shared_prefix[:, :part_a] + mask_1a = mask_shared_prefix[:, :, :part_a, :part_a] + + outs_1a = self.model.forward(input_1a, attention_mask=mask_1a, position_ids=position_ids_1a) + past_key_values_a = outs_1a["past_key_values"] + + # Case 1: we pass a 4D attention mask regarding the current sequence length (i.e. [..., seq_len, full_len]) + input_1b = input_ids_shared_prefix[:, part_a:] + position_ids_1b = position_ids_shared_prefix[:, part_a:] + mask_1b = mask_shared_prefix[:, :, part_a:, :] + outs_1b = self.model.forward( + input_1b, + attention_mask=mask_1b, + position_ids=position_ids_1b, + past_key_values=past_key_values_a, + ) + decoded_1b = [ + self.tokenizer.decode(t) + for t in outs_1b.logits.argmax(-1)[ + 0, torch.where(position_ids_shared_prefix == position_ids_shared_prefix.max())[1] - part_a + ] + ] + self.assertEqual(decoded, decoded_1b) + + def test_stacked_causal_mask_static_cache(self): + """same as above but with StaticCache""" + ( + input_ids, + position_ids, + input_ids_shared_prefix, + mask_shared_prefix, + position_ids_shared_prefix, + ) = self.get_test_data() + + # regular batch + logits = self.model.forward(input_ids, position_ids=position_ids).logits + logits_last = logits[:, -1, :] # last tokens in each batch line + decoded = [self.tokenizer.decode(t) for t in logits_last.argmax(dim=-1)] + + # upgrade the model with StaticCache + max_cache_len = 16 # note that max_cache_len is greater than the attention_mask.shape[-1] + past_key_values = StaticCache( + config=self.model.config, + max_batch_size=1, + max_cache_len=max_cache_len, + device=torch_device, + dtype=self.model.dtype, + ) + + padded_attention_mask = torch.nn.functional.pad( + input=mask_shared_prefix, + pad=(0, max_cache_len - mask_shared_prefix.shape[-1]), + mode="constant", + value=torch.finfo(self.model_dtype).min, + ) + + # single forward run with 4D custom mask + logits_shared_prefix = self.model.forward( + input_ids_shared_prefix, + attention_mask=padded_attention_mask, + position_ids=position_ids_shared_prefix, + cache_position=torch.arange(input_ids_shared_prefix.shape[-1], device=torch_device), + past_key_values=past_key_values, + ).logits + logits_shared_prefix_last = logits_shared_prefix[ + 0, torch.where(position_ids_shared_prefix == position_ids_shared_prefix.max())[1], : + ] # last three tokens + decoded_shared_prefix = [self.tokenizer.decode(t) for t in logits_shared_prefix_last.argmax(dim=-1)] + + self.assertEqual(decoded, decoded_shared_prefix) + + def test_partial_stacked_causal_mask_static_cache(self): + # Same as the test above, but the input is passed in two groups. 
It tests that we can pass partial 4D attention masks + # we pass a 4D attention mask shaped [..., seq_len, full_static_cache_len]) + ( + input_ids, + position_ids, + input_ids_shared_prefix, + mask_shared_prefix, + position_ids_shared_prefix, + ) = self.get_test_data() + + # regular batch + logits = self.model.forward(input_ids, position_ids=position_ids).logits + logits_last = logits[:, -1, :] # last tokens in each batch line + decoded = [self.tokenizer.decode(t) for t in logits_last.argmax(dim=-1)] + + # upgrade the model with StaticCache + max_cache_len = 16 # note that max_cache_len is greater than the attention_mask.shape[-1] + past_key_values = StaticCache( + config=self.model.config, + max_batch_size=1, + max_cache_len=max_cache_len, + device=torch_device, + dtype=self.model.dtype, + ) + + # forward run for the first part of input + part_a = 3 # split point + + input_1a = input_ids_shared_prefix[:, :part_a] + position_ids_1a = position_ids_shared_prefix[:, :part_a] + mask_1a = mask_shared_prefix[:, :, :part_a, :part_a] + + padded_mask_1a = torch.nn.functional.pad( + input=mask_1a, + pad=(0, max_cache_len - mask_1a.shape[-1]), + mode="constant", + value=torch.finfo(self.model_dtype).min, + ) + + _ = self.model.forward( + input_1a, + attention_mask=padded_mask_1a, + position_ids=position_ids_1a, + cache_position=torch.arange(part_a, device=torch_device), + past_key_values=past_key_values, + ) + + # forward run for the second part of input + input_1b = input_ids_shared_prefix[:, part_a:] + position_ids_1b = position_ids_shared_prefix[:, part_a:] + mask_1b = mask_shared_prefix[:, :, part_a:, :] + + padded_mask_1b = torch.nn.functional.pad( + input=mask_1b, pad=(0, max_cache_len - mask_1b.shape[-1]), mode="constant", value=0 + ) + + outs_1b = self.model.forward( + input_1b, + attention_mask=padded_mask_1b, + position_ids=position_ids_1b, + cache_position=torch.arange( + part_a, + input_ids_shared_prefix.shape[-1], + device=torch_device, + ), + past_key_values=past_key_values, + ) + decoded_1b = [ + self.tokenizer.decode(t) + for t in outs_1b.logits.argmax(-1)[ + 0, torch.where(position_ids_shared_prefix == position_ids_shared_prefix.max())[1] - part_a + ] + ] + self.assertEqual(decoded, decoded_1b) diff --git a/docs/transformers/tests/models/llama/test_tokenization_llama.py b/docs/transformers/tests/models/llama/test_tokenization_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..a69ea3948ef3c60cb4169e5ce686ca396ff9ecf8 --- /dev/null +++ b/docs/transformers/tests/models/llama/test_tokenization_llama.py @@ -0,0 +1,914 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
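The Mask4DTestHard cases above all hinge on two small transformations of the hand-written 0/1 mask: custom position ids are recovered from the row sums of the mask, and the mask is then turned into an additive bias before being handed to the model. A minimal sketch of that construction, assuming `mask` is a `[1, 1, seq_len, seq_len]` 0/1 tensor like `mask_shared_prefix` and `dtype` is the model's compute dtype:

import torch

def shared_prefix_inputs(mask: torch.Tensor, dtype: torch.dtype):
    # each token's position id is the number of tokens it can attend to, minus one
    position_ids = (mask.sum(dim=-1) - 1).reshape(1, -1)
    # convert the 0/1 visibility mask into an additive bias:
    # 0 where attention is allowed, the dtype's most negative value elsewhere
    additive_mask = mask.eq(0).to(dtype) * torch.finfo(dtype).min
    return position_ids, additive_mask

The static-cache variants additionally pad this bias along its last dimension up to max_cache_len, which is what the torch.nn.functional.pad calls above do.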
+ +import os +import pickle +import shutil +import tempfile +import unittest + +from datasets import load_dataset +from huggingface_hub import hf_hub_download + +from transformers import ( + SPIECE_UNDERLINE, + AddedToken, + AutoTokenizer, + LlamaTokenizer, + LlamaTokenizerFast, + PreTrainedTokenizerFast, +) +from transformers.convert_slow_tokenizer import convert_slow_tokenizer +from transformers.testing_utils import ( + get_tests_dir, + nested_simplify, + require_jinja, + require_read_token, + require_sentencepiece, + require_tiktoken, + require_tokenizers, + require_torch, + slow, +) + +from ...test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") + + +@require_sentencepiece +@require_tokenizers +class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = ["hf-internal-testing/llama-tokenizer", "meta-llama/Llama-2-7b-hf"] + tokenizer_class = LlamaTokenizer + rust_tokenizer_class = LlamaTokenizerFast + + test_rust_tokenizer = False + test_sentencepiece = True + from_pretrained_kwargs = {} + + @classmethod + def setUpClass(cls): + super().setUpClass() + + # We have a SentencePiece fixture for testing + tokenizer = LlamaTokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer.pad_token = tokenizer.eos_token + tokenizer.save_pretrained(cls.tmpdirname) + + def get_tokenizers(self, **kwargs): + kwargs.update({"pad_token": ""}) + return super().get_tokenizers(**kwargs) + + def test_full_tokenizer(self): + tokenizer = LlamaTokenizer(SAMPLE_VOCAB, keep_accents=True) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), + [285, 46, 10, 170, 382], + ) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, + [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4], + ) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "", + ".", + ], + ) + + @unittest.skip(reason="Let's wait for the fast tokenizer!") + def test_save_pretrained(self): + self.tokenizers_list += (self.rust_tokenizer_class, "hf-internal-testing/llama-tokenizer", {}) + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it save with the same files + the tokenizer.json file 
for the fast one + self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) + tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f) + self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + + shutil.rmtree(tmpdirname2) + + # Save tokenizer rust, legacy_format=True + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it save with the same files + self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + + shutil.rmtree(tmpdirname2) + + # Save tokenizer rust, legacy_format=False + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + + # Checks it saved the tokenizer.json file + self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + + shutil.rmtree(tmpdirname2) + + @require_torch + def test_batch_tokenization(self): + if not self.test_seq2seq: + self.skipTest(reason="test_seq2seq is set to False") + + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Longer text that will definitely require truncation. 
+ text = [ + " UN Chief Says There Is No Military Solution in Syria", + " Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for" + " Syria is that 'there is no military solution' to the nearly five-year conflict and more weapons" + " will only worsen the violence and misery for millions of people.", + ] + try: + batch = tokenizer( + text=text, + max_length=3, + max_target_length=10, + return_tensors="pt", + ) + except NotImplementedError: + self.skipTest(reason="Encountered NotImplementedError when calling tokenizer") + self.assertEqual(batch.input_ids.shape[1], 3) + # max_target_length will default to max_length if not specified + batch = tokenizer(text, max_length=3, return_tensors="pt") + self.assertEqual(batch.input_ids.shape[1], 3) + + batch_encoder_only = tokenizer(text=text, max_length=3, max_target_length=10, return_tensors="pt") + self.assertEqual(batch_encoder_only.input_ids.shape[1], 3) + self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) + self.assertNotIn("decoder_input_ids", batch_encoder_only) + + @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.") + def test_save_slow_from_fast_and_reload_fast(self): + pass + + def test_special_tokens_initialization(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + added_tokens = [AddedToken("", lstrip=True)] + + tokenizer_r = self.get_rust_tokenizer( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + r_output = tokenizer_r.encode("Hey this is a token") + + special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0] + + self.assertTrue(special_token_id in r_output) + + if self.test_slow_tokenizer: + tokenizer_cr = self.get_rust_tokenizer( + pretrained_name, + additional_special_tokens=added_tokens, + **kwargs, # , from_slow=True <- unfortunately too slow to convert + ) + tokenizer_p = self.tokenizer_class.from_pretrained( + pretrained_name, additional_special_tokens=added_tokens, **kwargs + ) + + p_output = tokenizer_p.encode("Hey this is a token") + + cr_output = tokenizer_cr.encode("Hey this is a token") + + self.assertEqual(p_output, r_output) + self.assertEqual(cr_output, r_output) + self.assertTrue(special_token_id in p_output) + self.assertTrue(special_token_id in cr_output) + + @slow + def test_tokenizer_integration(self): + expected_encoding = {'input_ids': [[1, 4103, 689, 414, 313, 24784, 368, 2998, 408, 282, 3637, 25350, 29899, 9067, 414, 322, 282, 3637, 25350, 29899, 1457, 3018, 1312, 29899, 2151, 29897, 8128, 2498, 29899, 15503, 4220, 6956, 1973, 313, 13635, 29911, 29892, 402, 7982, 29899, 29906, 29892, 1528, 13635, 29911, 29874, 29892, 1060, 26369, 29892, 6652, 309, 29933, 814, 29892, 1060, 29931, 6779, 11410, 363, 18385, 17088, 7634, 11235, 313, 25103, 29965, 29897, 322, 18385, 17088, 28203, 313, 25103, 29954, 29897, 411, 975, 29871, 29941, 29906, 29974, 758, 3018, 1312, 4733, 297, 29871, 29896, 29900, 29900, 29974, 10276, 322, 6483, 1006, 3372, 3097, 1546, 435, 1165, 29892, 10772, 29911, 25350, 322, 323, 6073, 17907, 29889], [1, 350, 20161, 338, 8688, 304, 758, 29899, 14968, 6483, 21000, 8684, 284, 22540, 515, 443, 29880, 24025, 1426, 491, 14002, 368, 4195, 292, 373, 1716, 2175, 322, 1492, 3030, 297, 599, 15359, 29889], [1, 450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203, 29889]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip + + self.tokenizer_integration_test_util( + expected_encoding=expected_encoding, + model_name="hf-internal-testing/llama-tokenizer", + revision="0984d03108b1a041ed679bd253b6519b7e1a4778", + padding=False, + ) + + def test_picklable(self): + with tempfile.NamedTemporaryFile() as f: + shutil.copyfile(SAMPLE_VOCAB, f.name) + tokenizer = LlamaTokenizer(f.name, keep_accents=True) + pickled_tokenizer = pickle.dumps(tokenizer) + pickle.loads(pickled_tokenizer) + + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") + def test_pickle_subword_regularization_tokenizer(self): + pass + + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") + def test_subword_regularization_tokenizer(self): + pass + + def test_add_prefix_space(self): + pretrained_name = "hf-internal-testing/llama-tokenizer-non-normalized" + inputs = "Hey how are you doing" + EXPECTED_WITH_SPACE = [1, 18637, 920, 526, 366, 2599] + EXPECTED_WO_SPACE = [1, 29950, 1032, 920, 526, 366, 2599] + + slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=False, legacy=False) + fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=False, legacy=False) + self.assertEqual(slow_.encode(inputs), EXPECTED_WO_SPACE) + self.assertEqual(slow_.encode(inputs), fast_.encode(inputs)) + self.assertEqual(slow_.tokenize(inputs), ["H", "ey", "▁how", "▁are", "▁you", "▁doing"]) + self.assertEqual(slow_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True), inputs) + self.assertEqual( + slow_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True), + fast_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True), + ) + + slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=True, legacy=False) + fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=True, legacy=False) + self.assertEqual(slow_.encode(inputs), EXPECTED_WITH_SPACE) + self.assertEqual(slow_.encode(inputs), fast_.encode(inputs)) + self.assertEqual(slow_.tokenize(inputs), ["▁Hey", "▁how", "▁are", "▁you", "▁doing"]) + self.assertEqual(slow_.decode(EXPECTED_WITH_SPACE, skip_special_tokens=True), inputs) + self.assertEqual( + slow_.decode(EXPECTED_WITH_SPACE, skip_special_tokens=True), + fast_.decode(EXPECTED_WITH_SPACE, skip_special_tokens=True), + ) + + def test_load_tokenizer_with_model_file_only(self): + with tempfile.TemporaryDirectory() as tmp_dir: + hf_hub_download(repo_id="huggyllama/llama-7b", filename="tokenizer.model", local_dir=tmp_dir) + tokenizer_fast = self.rust_tokenizer_class.from_pretrained(tmp_dir) + self.assertEqual(tokenizer_fast.encode("This is a test"), [1, 910, 338, 263, 1243]) + + tokenizer_slow = self.tokenizer_class.from_pretrained(tmp_dir) + self.assertEqual(tokenizer_slow.encode("This is a test"), [1, 910, 338, 263, 1243]) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class LlamaIntegrationTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + checkpoint_name = "hf-internal-testing/llama-tokenizer-non-normalized" + cls.tokenizer: LlamaTokenizer = LlamaTokenizer.from_pretrained(checkpoint_name) + cls.rust_tokenizer = LlamaTokenizerFast.from_pretrained(checkpoint_name) + return cls + + @require_torch + 
def integration_tests(self): + inputs = self.tokenizer( + ["The following string should be properly encoded: Hello.", "But ird and ปี ird ด"], + return_tensors="pt", + ) + + self.assertEqual( + nested_simplify(inputs), + { + "input_ids": [ + [1, 450, 1494, 1347, 881, 367, 6284, 18511, 29901, 15043, 29889], + [1, 1205, 29871, 1823, 322, 29871, 31010, 30691, 1678, 1823, 1678, 30718], + ], + "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], + }, + ) + + def test_fast_special_tokens(self): + slow_tokenizer = self.tokenizer + fast_tokenizer = self.rust_tokenizer + slow = slow_tokenizer.encode("A sample test", add_special_tokens=True) + assert slow == [1, 319, 4559, 1243] + + fast_tokenizer.add_eos_token = False + fast = fast_tokenizer.encode("A sample test", add_special_tokens=True) + assert fast == [1, 319, 4559, 1243] + + fast_tokenizer.add_eos_token = True + print(fast_tokenizer.add_eos_token) + fast = fast_tokenizer.encode("A sample test", add_special_tokens=True) + assert fast == [1, 319, 4559, 1243, 2] + + slow_tokenizer.add_eos_token = True + slow = slow_tokenizer.encode("A sample test", add_special_tokens=True) + assert slow == [1, 319, 4559, 1243, 2] + + fast_tokenizer = LlamaTokenizerFast.from_pretrained( + "hf-internal-testing/llama-tokenizer", add_eos_token=True, add_bos_token=False + ) + fast = fast_tokenizer.encode("A sample test", add_special_tokens=True) + assert fast == [319, 4559, 1243, 2] + + slow_tokenizer = LlamaTokenizer.from_pretrained( + "hf-internal-testing/llama-tokenizer", add_eos_token=True, add_bos_token=False + ) + slow = slow_tokenizer.encode("A sample test", add_special_tokens=True) + assert slow == [319, 4559, 1243, 2] + + self.tokenizer.add_eos_token = False + self.rust_tokenizer.add_eos_token = False + + @slow + def test_conversion(self): + # This is excruciatingly slow since it has to recreate the entire merge + # list from the original vocabulary in spm + self.rust_tokenizer.save_pretrained("./out") + with tempfile.TemporaryDirectory() as dirname: + self.rust_tokenizer.save_pretrained(dirname) + + with open(os.path.join(dirname, "tokenizer.json")) as f: + old_serialized = f.read() + + new_tokenizer = convert_slow_tokenizer(self.tokenizer) + with tempfile.NamedTemporaryFile() as f: + new_tokenizer.save(f.name) + # Re-opening since `f` is in bytes. 
+ new_serialized = open(f.name).read() + with open("out_tokenizer.json", "w") as g: + g.write(new_serialized) + + self.assertEqual(old_serialized, new_serialized) + + def test_simple_encode_decode(self): + pyth_tokenizer = self.tokenizer + rust_tokenizer = self.rust_tokenizer + + self.assertEqual(pyth_tokenizer.encode("This is a test"), [1, 910, 338, 263, 1243]) + self.assertEqual(rust_tokenizer.encode("This is a test"), [1, 910, 338, 263, 1243]) + self.assertEqual(pyth_tokenizer.decode([1, 910, 338, 263, 1243], skip_special_tokens=True), "This is a test") + self.assertEqual(rust_tokenizer.decode([1, 910, 338, 263, 1243], skip_special_tokens=True), "This is a test") + + # bytefallback showcase + self.assertEqual(pyth_tokenizer.encode("生活的真谛是"), [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392]) # fmt: skip + self.assertEqual(rust_tokenizer.encode("生活的真谛是"), [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392]) # fmt: skip + self.assertEqual( + pyth_tokenizer.decode( + [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392], skip_special_tokens=True + ), + "生活的真谛是", + ) + self.assertEqual( + rust_tokenizer.decode( + [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392], skip_special_tokens=True + ), + "生活的真谛是", + ) + + # Inner spaces showcase + self.assertEqual(pyth_tokenizer.encode("Hi Hello"), [1, 6324, 29871, 15043]) + self.assertEqual(rust_tokenizer.encode("Hi Hello"), [1, 6324, 29871, 15043]) + self.assertEqual(pyth_tokenizer.decode([1, 6324, 29871, 15043], skip_special_tokens=True), "Hi Hello") + self.assertEqual(rust_tokenizer.decode([1, 6324, 29871, 15043], skip_special_tokens=True), "Hi Hello") + + self.assertEqual(pyth_tokenizer.encode("Hi Hello"), [1, 6324, 259, 15043]) + self.assertEqual(rust_tokenizer.encode("Hi Hello"), [1, 6324, 259, 15043]) + self.assertEqual(pyth_tokenizer.decode([1, 6324, 259, 15043], skip_special_tokens=True), "Hi Hello") + self.assertEqual(rust_tokenizer.decode([1, 6324, 259, 15043], skip_special_tokens=True), "Hi Hello") + + self.assertEqual(pyth_tokenizer.encode(""), [1]) + self.assertEqual(rust_tokenizer.encode(""), [1]) + + self.assertEqual(pyth_tokenizer.encode(" "), [1, 259]) + self.assertEqual(rust_tokenizer.encode(" "), [1, 259]) + + self.assertEqual(pyth_tokenizer.encode(" "), [1, 1678]) + self.assertEqual(rust_tokenizer.encode(" "), [1, 1678]) + + self.assertEqual(pyth_tokenizer.encode(" Hello"), [1, 29871, 15043]) + self.assertEqual(rust_tokenizer.encode(" Hello"), [1, 29871, 15043]) + + def test_no_differences_showcase(self): + pyth_tokenizer = self.tokenizer + rust_tokenizer = self.rust_tokenizer + self.assertEqual(pyth_tokenizer.encode(""), [1]) + self.assertEqual(rust_tokenizer.encode(""), [1]) + + self.assertEqual(pyth_tokenizer.encode(" "), [1, 259]) + self.assertEqual(rust_tokenizer.encode(" "), [1, 259]) + + self.assertEqual(pyth_tokenizer.encode(" "), [1, 1678]) + self.assertEqual(rust_tokenizer.encode(" "), [1, 1678]) + + self.assertEqual(pyth_tokenizer.encode(" Hello"), [1, 29871, 15043]) + self.assertEqual(rust_tokenizer.encode(" Hello"), [1, 29871, 15043]) + + self.assertEqual(pyth_tokenizer.encode(""), [1, 1]) + self.assertEqual(rust_tokenizer.encode(""), [1, 1]) + + def test_no_differences_decode(self): + pyth_tokenizer = self.tokenizer + rust_tokenizer = self.rust_tokenizer + + self.assertEqual(pyth_tokenizer.decode([869]), ".") + self.assertEqual(rust_tokenizer.decode([869]), ".") + + self.assertEqual(pyth_tokenizer.decode([30112, 869]), "ا .") + self.assertEqual(rust_tokenizer.decode([30112, 
869]), "ا .") + + def test_no_differences_special_tokens(self): + pyth_tokenizer = self.tokenizer + rust_tokenizer = self.rust_tokenizer + self.assertEqual(pyth_tokenizer.encode(""), [1]) + self.assertEqual(rust_tokenizer.encode(""), [1]) + + self.assertEqual(pyth_tokenizer.encode(""), [1, 1]) + self.assertEqual(rust_tokenizer.encode(""), [1, 1]) + + @unittest.skipIf( + os.getenv("RUN_TOKENIZER_INTEGRATION", "0") == "0", + "RUN_TOKENIZER_INTEGRATION=1 to run tokenizer integration tests", + ) + def test_integration_test_xnli(self): + import tqdm + + pyth_tokenizer = self.tokenizer + rust_tokenizer = self.rust_tokenizer + + dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go") + for item in tqdm.tqdm(dataset["validation"]): + string = item["code"] + encoded1 = pyth_tokenizer.encode(string) + encoded2 = rust_tokenizer.encode(string) + + self.assertEqual(encoded1, encoded2) + + decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True) + decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True) + + self.assertEqual(decoded1, decoded2) + + dataset = load_dataset("facebook/xnli", "all_languages") + + for item in tqdm.tqdm(dataset["train"]): + for string in item["premise"].values(): + encoded1 = pyth_tokenizer.encode(string) + encoded2 = rust_tokenizer.encode(string) + + self.assertEqual(encoded1, encoded2) + + decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True) + decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True) + + self.assertEqual(decoded1, decoded2) + + def test_special_token_special_word(self): + # the word inform should be split as ['in', 'form'] + tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True) + tokenizer.add_tokens([AddedToken("", rstrip=True, lstrip=True)], special_tokens=False) + + example_inputs = tokenizer.tokenize("inform. Hey. .") + self.assertEqual(example_inputs, ["", "in", "form", "", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."]) + + # Make sure dummy space is added if it is indeed the first word + example_inputs = tokenizer.tokenize("inform. Hey. .") + self.assertEqual(example_inputs, ["▁inform", "", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."]) + out1 = tokenizer.decode( + tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=False + ) + self.assertEqual(out1, "inform") + out2 = tokenizer.decode( + tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=True + ) + # decoding strips the added prefix space. 
+ self.assertEqual(out2, "inform") + input_ids = tokenizer.encode("inform", add_special_tokens=False) + self.assertEqual(input_ids, [32000, 262, 689]) # 29871 is the spiece underline, '▁' added as it should + + out2 = tokenizer.decode( + tokenizer.encode(" inform", add_special_tokens=False), spaces_between_special_tokens=False + ) + # TODO @ArthurZ currently we strip left and right, so this will not keep the spaces + self.assertEqual(out2, "inform") + + ### Let's make sure decoding does not add extra spaces here and there + # TODO @ArthurZ this should be affected by the lstrip/rstrip/single word /normalize refactoring + # Since currently we always strip left and right of the token, results are as such + input_ids = tokenizer.encode(" Hellohow", add_special_tokens=False) + self.assertEqual(input_ids, [1, 15043, 1, 3525]) + tokens = tokenizer.tokenize(" Hellohow", add_special_tokens=False) + self.assertEqual(tokens, ["", "▁Hello", "", "how"]) + decoded_tokens = tokenizer.decode(input_ids) + self.assertEqual(decoded_tokens, " Hellohow") + + # Let's make sure that if there are any spaces, we don't remove them! + input_ids = tokenizer.encode(" Hello how", add_special_tokens=False) + self.assertEqual(input_ids, [29871, 1, 15043, 1, 920]) + tokens = tokenizer.tokenize(" Hello how", add_special_tokens=False) + self.assertEqual(tokens, ["▁", "", "▁Hello", "", "▁how"]) + decoded_tokens = tokenizer.decode(input_ids) + self.assertEqual(decoded_tokens, " Hello how") + + # Let's make sure the space is preserved + input_ids = tokenizer.encode("hello", add_special_tokens=True) + self.assertEqual(input_ids, [1, 22172]) + tokens = tokenizer.tokenize("hello") + self.assertEqual(tokens, ["▁hello"]) + decoded_tokens = tokenizer.decode(input_ids) + self.assertEqual(decoded_tokens, " hello") + + input_ids = tokenizer.encode("hello", add_special_tokens=False) + self.assertEqual(input_ids, [22172]) + decoded_tokens = tokenizer.decode(input_ids) + self.assertEqual(decoded_tokens, "hello") + + def test_no_prefix_space(self): + tokenizer_no_prefix_space = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", add_prefix_space=False) + no_prefix_space_tokens = tokenizer_no_prefix_space.tokenize("Hey") + self.assertEqual(no_prefix_space_tokens, ["H", "ey"]) + + tokenizer = LlamaTokenizerFast.from_pretrained( + "huggyllama/llama-7b", legacy=False, from_slow=True, add_prefix_space=False + ) + tokenizer.add_tokens([AddedToken("", rstrip=True, lstrip=True)], special_tokens=False) + + example_inputs = tokenizer.tokenize("inform. Hey. .") + self.assertEqual(example_inputs, ["", "in", "form", "", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."]) + + # Make sure dummy space is added if it is indeed the first word + example_inputs = tokenizer.tokenize("inform. Hey. .") + self.assertEqual(example_inputs, ["in", "form", "", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."]) + out1 = tokenizer.decode( + tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=False + ) + self.assertEqual(out1, "inform") + out2 = tokenizer.decode( + tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=True + ) + # decoding strips the added prefix space. 
+ self.assertEqual(out2, "inform") + input_ids = tokenizer.encode("inform", add_special_tokens=False) + self.assertEqual(input_ids, [32000, 262, 689]) # 29871 is the spiece underline, '▁' added as it should + + out2 = tokenizer.decode( + tokenizer.encode(" inform", add_special_tokens=False), spaces_between_special_tokens=False + ) + self.assertEqual(out2, "inform") + + input_ids = tokenizer.encode(" Hellohow", add_special_tokens=False) + self.assertEqual(input_ids, [1, 15043, 1, 3525]) + tokens = tokenizer.tokenize(" Hellohow", add_special_tokens=False) + self.assertEqual(tokens, ["", "▁Hello", "", "how"]) + decoded_tokens = tokenizer.decode(input_ids) + self.assertEqual(decoded_tokens, " Hellohow") + + # Let's make sure that if there are any spaces, we don't remove them! + input_ids = tokenizer.encode(" Hello how", add_special_tokens=False) + self.assertEqual(input_ids, [29871, 1, 15043, 1, 920]) + tokens = tokenizer.tokenize(" Hello how", add_special_tokens=False) + self.assertEqual(tokens, ["▁", "", "▁Hello", "", "▁how"]) + decoded_tokens = tokenizer.decode(input_ids) + self.assertEqual(decoded_tokens, " Hello how") + + # Let's make sure the space is preserved + input_ids = tokenizer.encode("hello", add_special_tokens=True) + self.assertEqual(input_ids, [1, 12199]) + tokens = tokenizer.tokenize("hello") + self.assertEqual(tokens, ["hello"]) + decoded_tokens = tokenizer.decode(input_ids) + self.assertEqual(decoded_tokens, "hello") + + input_ids = tokenizer.encode("hello", add_special_tokens=False) + self.assertEqual(input_ids, [12199]) + decoded_tokens = tokenizer.decode(input_ids) + self.assertEqual(decoded_tokens, "hello") + + def test_some_edge_cases(self): + tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False) + + sp_tokens = tokenizer.sp_model.encode(">", out_type=str) + self.assertEqual(sp_tokens, ["<", "s", ">>"]) + tokens = tokenizer.tokenize(">") + self.assertNotEqual(sp_tokens, tokens) + self.assertEqual(tokens, ["", ">"]) + + tokens = tokenizer.tokenize("") + self.assertEqual(tokens, []) + self.assertEqual(tokens, tokenizer.sp_model.encode("", out_type=str)) + + tokens = tokenizer.tokenize(" ") + self.assertEqual(tokens, ["▁▁"]) + # a dummy prefix space is not added by the sp_model as it was de-activated + self.assertEqual(tokens, tokenizer.sp_model.encode(" ", out_type=str)) + + tokens = tokenizer.tokenize("▁") + self.assertEqual(tokens, ["▁▁"]) + # a dummy prefix space is not added by the sp_model as it was de-activated + self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁", out_type=str)) + + tokens = tokenizer.tokenize(" ▁") + self.assertEqual(tokens, ["▁▁▁"]) + # a dummy prefix space is not added by the sp_model as it was de-activated + self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁▁", out_type=str)) + + def test_fast_post_processor(self): + tokenizer = LlamaTokenizerFast( + SAMPLE_VOCAB, eos_token=None, bos_token=None, add_bos_token=False, add_eos_token=False + ) + tokenizer.encode(" Hey ") + + with self.assertRaises(ValueError): + tokenizer = LlamaTokenizerFast( + SAMPLE_VOCAB, bos_token=None, eos_token="", add_bos_token=True, add_eos_token=False + ) + with self.assertRaises(ValueError): + tokenizer = LlamaTokenizerFast(SAMPLE_VOCAB, eos_token=None, add_bos_token=True, add_eos_token=True) + + @require_jinja + def test_tokenization_for_chat(self): + tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False) + + test_chats = [ + [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", 
"content": "Hello!"}], + [ + {"role": "system", "content": "You are a helpful chatbot."}, + {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Nice to meet you."}, + ], + [{"role": "user", "content": "Hello!"}], + ] + # Matt: The third test case tests the default system message, but if this is ever changed in the + # class/repo code then that test will fail, and the case will need to be updated. + tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] + # fmt: off + expected_tokens = [ + [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962], + [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962, 20103, 304, 5870, 366, 29889, 29871, 2], + [1, 29961, 25580, 29962, 15043, 29991, 518, 29914, 25580, 29962] + ] + # fmt: on + for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): + self.assertListEqual(tokenized_chat, expected_tokens) + + +@require_sentencepiece +@require_tokenizers +class CommonSpmIntegrationTests(unittest.TestCase): + """ + A class that regroups important test to make sure that we properly handle the special tokens. + """ + + @classmethod + def setUpClass(cls): + tokenizer = LlamaTokenizer(SAMPLE_VOCAB, extra_ids=0, add_bos_token=False, legacy=False) + tokenizer.add_special_tokens({"additional_special_tokens": [AddedToken("", rstrip=False, lstrip=False)]}) + cls.tokenizer = tokenizer + return cls + + def test_add_dummy_prefix(self): + # make sure `'▁'` is prepended, and outputs match sp_model's + # `sentencepiece.NormalizerSpec.add_dummy_prefix` attribute + input_ids = self.tokenizer.encode(". Hello") + self.assertEqual(input_ids, [7, 4, 156, 86, 20]) + sp_encode = self.tokenizer.sp_model.encode(". Hello") + self.assertEqual(input_ids, [7] + sp_encode) + tokens = self.tokenizer.tokenize(". Hello") + self.assertEqual(tokens, ["▁", ".", "▁He", "ll", "o"]) + + tokens = self.tokenizer.tokenize("") + self.assertEqual(tokens, []) + self.assertEqual(tokens, self.tokenizer.sp_model.encode("", out_type=str)) + + tokens = self.tokenizer.tokenize(" ") + self.assertEqual(tokens, []) + self.assertEqual(tokens, self.tokenizer.sp_model.encode(" ", out_type=str)) + + tokens = self.tokenizer.tokenize("▁") + self.assertEqual(tokens, []) + self.assertEqual(tokens, self.tokenizer.sp_model.encode("▁", out_type=str)) + + def test_remove_extra_whitespaces(self): + # make sure the extra spaces are eaten. Since the sample vocab does not have + # `______`. sentencepiece.NormalizerSpec.remove_extra_whitespaces attribute is set to False + + input_ids = self.tokenizer.encode(" . Hello") + self.assertEqual(input_ids, [7, 4, 156, 86, 20]) + sp_encode = self.tokenizer.sp_model.encode(" . Hello") + self.assertEqual(input_ids, [7] + sp_encode) + tokens = self.tokenizer.tokenize(" . 
Hello") + self.assertEqual(tokens, ["▁", ".", "▁He", "ll", "o"]) + + # `'▁'` is also a whitespace + input_ids = self.tokenizer.encode("▁He is not") + self.assertEqual(input_ids, [156, 46, 44]) + tokens = self.tokenizer.tokenize("▁He is not") + sp_encode = [ + self.tokenizer.sp_model.piece_to_id("▁He"), + self.tokenizer.sp_model.piece_to_id("▁is"), + self.tokenizer.sp_model.piece_to_id("▁not"), + ] + self.assertEqual(input_ids, sp_encode) + self.assertEqual(tokens, ["▁He", "▁is", "▁not"]) # no extra space added + + input_ids = self.tokenizer.encode("▁He is not ▁He") + self.assertEqual(input_ids, [156, 46, 44, 1, 156]) + tokens = self.tokenizer.tokenize("▁He is not ▁He") + self.assertEqual(tokens, ["▁He", "▁is", "▁not", "", "▁He"]) # spaces are eaten by spm + our strip + # make sure that the output after the extra id is the same as if + # extra_id was not there + input_ids = self.tokenizer.encode("▁He is not ▁He") + self.assertEqual(input_ids, [156, 46, 44, 156]) + tokens = self.tokenizer.tokenize("▁He is not ▁He") + self.assertEqual(tokens, ["▁He", "▁is", "▁not", "▁He"]) # spaces are eaten by spm even if not start + + def test_character_after_special_token(self): + # Make sure that `tokenizer.tokenize` is similar to + # adding the equivalent special token to the vocab + input_ids = self.tokenizer.encode("Hey I") + self.assertEqual(input_ids, [156, 30, 1, 100]) + sp_encode = self.tokenizer.sp_model.encode("Hey .I") + # the last token should be 100 + self.assertEqual(input_ids[-1], sp_encode[-1]) + tokens = self.tokenizer.tokenize("I") + self.assertEqual(tokens, ["", "I"]) + + input_ids = self.tokenizer.encode("Hello, ,") + self.assertEqual(input_ids, [156, 86, 20, 3, 1, 3]) + tokens = self.tokenizer.tokenize("Hello, ,") + self.assertEqual(tokens, ["▁He", "ll", "o", ",", "", ","]) + + def test_special_tokens_strip(self): + input_ids = self.tokenizer.encode(" ,") + self.assertEqual(input_ids, [1, 7, 3]) + tokens = self.tokenizer.tokenize(" ,") + # spaces are eaten by rstrip / lstrip + spm sp_model.encode(" ") = [] + self.assertEqual(tokens, ["", "▁", ","]) + + input_ids = self.tokenizer.encode("No ▁He") + self.assertEqual(input_ids, [284, 1, 156]) + tokens = self.tokenizer.tokenize("No ▁He") + self.assertEqual(tokens, ["▁No", "", "▁He"]) # spaces are eaten by rstrip / lstrip + + +@require_tiktoken +@require_read_token +class TikTokenIntegrationTests(unittest.TestCase): + """ + A class that regroups important test to make sure that we properly handle the special tokens. + """ + + def test_tiktoken_llama(self): + model_path = "hf-internal-testing/llama-3-8b-internal" + subfolder = "original" + test_text = "This is a test sentence." 
+ test_tokens = [128000, 2028, 374, 264, 1296, 11914, 13, 128001] + num_reserved_special_tokens = 256 + special_tokens = [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|reserved_special_token_2|>", + "<|reserved_special_token_3|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|reserved_special_token_4|>", + "<|eot_id|>", + "<|python_tag|>", # end of turn + ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)] + + tiktoken_tokenizer = PreTrainedTokenizerFast.from_pretrained( + model_path, + subfolder=subfolder, + additional_special_tokens=special_tokens, + bos_token="<|begin_of_text|>", + eos_token="<|end_of_text|>", + ) + tokens = tiktoken_tokenizer.tokenize("<|begin_of_text|> " + test_text) + self.assertEqual(tokens[0], "<|begin_of_text|>") + + tiktoken_tokenizer = AutoTokenizer.from_pretrained( + model_path, + subfolder=subfolder, + legacy=False, + additional_special_tokens=special_tokens, + bos_token="<|begin_of_text|>", + eos_token="<|end_of_text|>", + add_bos_token=True, + add_eos_token=True, + ) + self.assertTrue(isinstance(tiktoken_tokenizer, PreTrainedTokenizerFast)) + + tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True) + self.assertEqual(tokens, test_tokens) + + tmpdirname = tempfile.mkdtemp() + tiktoken_tokenizer.save_pretrained(tmpdirname) + tokenizer_reload = AutoTokenizer.from_pretrained(tmpdirname) + + self.assertTrue(isinstance(tokenizer_reload, PreTrainedTokenizerFast)) + tokens = tokenizer_reload.encode(test_text, add_special_tokens=True) + self.assertEqual(tokens, test_tokens) + shutil.rmtree(tmpdirname) + + tiktoken_tokenizer = AutoTokenizer.from_pretrained( + model_path, + subfolder=subfolder, + additional_special_tokens=special_tokens, + bos_token="<|begin_of_text|>", + eos_token="<|end_of_text|>", + from_slow=True, + add_bos_token=True, + add_eos_token=True, + ) + tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True) + self.assertEqual(tokens, test_tokens) diff --git a/docs/transformers/tests/models/llama4/__init__.py b/docs/transformers/tests/models/llama4/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/llama4/test_image_processing_llama4.py b/docs/transformers/tests/models/llama4/test_image_processing_llama4.py new file mode 100644 index 0000000000000000000000000000000000000000..dfc39f4ff490b60a3ffd71623fc4745f0bbd6779 --- /dev/null +++ b/docs/transformers/tests/models/llama4/test_image_processing_llama4.py @@ -0,0 +1,131 @@ +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + pass + +if is_vision_available() and is_torchvision_available(): + from transformers import Llama4ImageProcessorFast + + +class Llama4ImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + max_patches=1, + do_resize=True, + size=None, + do_normalize=True, + do_pad=False, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + do_convert_rgb=True, + ): + super().__init__() + size = size if size is not None else {"height": 20, "width": 20} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.max_patches = max_patches + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_pad = do_pad + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "max_patches": self.max_patches, + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + "do_pad": self.do_pad, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.size["height"], self.size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class Llama4ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + test_slow_image_processor = False + fast_image_processing_class = Llama4ImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = Llama4ImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processor, "do_resize")) + self.assertTrue(hasattr(image_processor, "size")) + self.assertTrue(hasattr(image_processor, "do_normalize")) + self.assertTrue(hasattr(image_processor, "image_mean")) + self.assertTrue(hasattr(image_processor, "image_std")) + self.assertTrue(hasattr(image_processor, "do_convert_rgb")) + + def test_split_tiles(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class(**self.image_processor_dict) + image = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)[0] + processed_images = image_processor( + image, + max_patches=16, + ) + self.assertEqual(len(processed_images.pixel_values), 1) + self.assertEqual(processed_images.pixel_values[0].shape[0], 17) + 
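+ # With max_patches=16, 17 patches per image are expected (one more than max_patches), each resized to the configured 20x20.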
self.assertEqual(processed_images.pixel_values[0].shape[-2:], (20, 20)) + + @unittest.skip("Broken on main right now. Should be fixable!") + def test_image_processor_save_load_with_autoimageprocessor(self): + pass diff --git a/docs/transformers/tests/models/llama4/test_modeling_llama4.py b/docs/transformers/tests/models/llama4/test_modeling_llama4.py new file mode 100644 index 0000000000000000000000000000000000000000..f3f1d137d17411cac73d87256386daa7a1557592 --- /dev/null +++ b/docs/transformers/tests/models/llama4/test_modeling_llama4.py @@ -0,0 +1,120 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Llama4 model.""" + +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import ( + require_read_token, + require_torch_large_gpu, + slow, + torch_device, +) + + +if is_torch_available(): + import torch + + from transformers import ( + Llama4ForConditionalGeneration, + Llama4Processor, + ) + + +@slow +@require_torch_large_gpu +@require_read_token +class Llama4IntegrationTest(unittest.TestCase): + model_id = "ll-re/Llama-4-17B-Omni-Instruct" + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + cls.model = Llama4ForConditionalGeneration.from_pretrained( + "ll-re/Llama-4-17B-Omni-Instruct", device_map="auto", torch_dtype=torch.float32 + ) + + def setUp(self): + self.processor = Llama4Processor.from_pretrained("ll-re/Llama-4-17B-Omni-Instruct", padding_side="left") + + url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" + self.messages = [ + {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}, + { + "role": "user", + "content": [ + {"type": "image", "url": url}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + ] + + def test_model_17b_16e_fp16(self): + EXPECTED_TEXT = [ + "The capital of France is Paris, which is located in the north-central part of the country. Paris is known for its iconic landmarks such as the", + "Roses are red, violets are blue, and this poem is about you. 
Roses are red, violets are blue, and I love", + ] + + messages = [ + {"role": "user", "content": "Who are you?"}, + ] + inputs = self.processor.apply_chat_template( + messages, add_generation_prompt=True, return_tensors="pt", return_dict=True + ).to(torch_device) + + output = self.model.generate(**inputs, max_new_tokens=100) + output_text = self.processor.batch_decode(output, skip_special_tokens=True) + + print(output_text) + self.assertEqual(output_text, EXPECTED_TEXT) + + def test_model_17b_16e_batch(self): + messages_2 = [ + {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}, + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png", + }, + {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}, + {"type": "text", "text": "Are these images identical?"}, + ], + }, + ] + + inputs = self.processor.apply_chat_template( + [self.messages, messages_2], + tokenize=True, + return_dict=True, + return_tensors="pt", + padding=True, + add_generation_prompt=True, + ).to(torch_device) + + output = self.model.generate(**inputs, max_new_tokens=30, do_sample=False) + output_text = self.processor.batch_decode(output, skip_special_tokens=True) + + EXPECTED_TEXTS = [ + 'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like', + "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a cow" + ] # fmt: skip + self.assertEqual(output_text, EXPECTED_TEXTS) diff --git a/docs/transformers/tests/models/llama4/test_processor_llama4.py b/docs/transformers/tests/models/llama4/test_processor_llama4.py new file mode 100644 index 0000000000000000000000000000000000000000..aef3539a37ea0ff3f4ea9620a926c7d5a80ef704 --- /dev/null +++ b/docs/transformers/tests/models/llama4/test_processor_llama4.py @@ -0,0 +1,53 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import shutil +import tempfile +import unittest + +from transformers import AutoProcessor, Llama4Processor, PreTrainedTokenizerFast +from transformers.testing_utils import require_vision +from transformers.utils import is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import Llama4ImageProcessorFast + + +@require_vision +class Llama4ProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = Llama4Processor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + + image_processor = Llama4ImageProcessorFast(max_patches=1, size={"height": 20, "width": 20}) + tokenizer = PreTrainedTokenizerFast.from_pretrained("unsloth/Llama-3.2-11B-Vision-Instruct-unsloth-bnb-4bit") + processor_kwargs = cls.prepare_processor_dict() + processor = Llama4Processor(image_processor, tokenizer, **processor_kwargs) + processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.image_token + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname) diff --git a/docs/transformers/tests/models/llava/__init__.py b/docs/transformers/tests/models/llava/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/llava/test_configuration_llava.py b/docs/transformers/tests/models/llava/test_configuration_llava.py new file mode 100644 index 0000000000000000000000000000000000000000..3b28adc1ee66b6ad4404ca46a4880cefd2b9ddd0 --- /dev/null +++ b/docs/transformers/tests/models/llava/test_configuration_llava.py @@ -0,0 +1,70 @@ +import tempfile +import unittest + +from transformers import LlavaConfig + + +class LlavaConfigTest(unittest.TestCase): + def test_llava_reload(self): + """ + Simple test for reloading default llava configs + """ + with tempfile.TemporaryDirectory() as tmp_dir: + config = LlavaConfig() + config.save_pretrained(tmp_dir) + + reloaded = LlavaConfig.from_pretrained(tmp_dir) + assert config.to_dict() == reloaded.to_dict() + + def test_pixtral_reload(self): + """ + Simple test for reloading pixtral configs + """ + vision_config = { + "model_type": "pixtral", + "head_dim": 64, + "hidden_act": "silu", + "image_size": 1024, + "is_composition": True, + "patch_size": 16, + "rope_theta": 10000.0, + "tie_word_embeddings": False, + } + + text_config = { + "model_type": "mistral", + "hidden_size": 5120, + "head_dim": 128, + "num_attention_heads": 32, + "intermediate_size": 14336, + "is_composition": True, + "max_position_embeddings": 1024000, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000000.0, + "sliding_window": None, + "vocab_size": 131072, + } + + with tempfile.TemporaryDirectory() as tmp_dir: + config = LlavaConfig(vision_config=vision_config, text_config=text_config) + config.save_pretrained(tmp_dir) + + reloaded = LlavaConfig.from_pretrained(tmp_dir) + assert config.to_dict() == reloaded.to_dict() + + def test_arbitrary_reload(self): + """ + Simple test for reloading arbirarily composed subconfigs + """ + default_values = LlavaConfig().to_diff_dict() + default_values["vision_config"]["model_type"] = "pixtral" + default_values["text_config"]["model_type"] = "opt" + 
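+ # Swapping the sub-config model types exercises reload of an arbitrarily composed LlavaConfig (pixtral vision tower + opt text model), not just the defaults.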
self.maxDiff = None + with tempfile.TemporaryDirectory() as tmp_dir: + config = LlavaConfig(**default_values) + config.save_pretrained(tmp_dir) + + reloaded = LlavaConfig.from_pretrained(tmp_dir) + self.assertDictEqual(config.to_dict(), reloaded.to_dict()) diff --git a/docs/transformers/tests/models/llava/test_image_processing_llava.py b/docs/transformers/tests/models/llava/test_image_processing_llava.py new file mode 100644 index 0000000000000000000000000000000000000000..b91c44d82b067b31a2b0f92a1cc68b1ea173a577 --- /dev/null +++ b/docs/transformers/tests/models/llava/test_image_processing_llava.py @@ -0,0 +1,238 @@ +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest +from typing import Union + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from PIL import Image + + from transformers import LlavaImageProcessor + + if is_torchvision_available(): + from torchvision.transforms import functional as F + + from transformers import LlavaImageProcessorFast + + +class LlavaImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_pad=True, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + do_convert_rgb=True, + ): + super().__init__() + size = size if size is not None else {"shortest_edge": 20} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_pad = do_pad + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "do_pad": self.do_pad, + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape + def expected_output_image_shape(self, images): + return self.num_channels, self.crop_size["height"], self.crop_size["width"] + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs + def prepare_image_inputs(self, 
equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +# Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest with CLIP->Llava +class LlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = LlavaImageProcessor if is_vision_available() else None + fast_image_processing_class = LlavaImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = LlavaImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + # Ignore copy + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_pad")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 20}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) + self.assertEqual(image_processor.size, {"shortest_edge": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) + + # Ignore copy + def test_padding(self): + """ + LLaVA needs to pad images to square size before processing as per orig implementation. + Checks that image processor pads images correctly given different background colors. 
+ """ + + # taken from original implementation: https://github.com/haotian-liu/LLaVA/blob/c121f0432da27facab705978f83c4ada465e46fd/llava/mm_utils.py#L152 + def pad_to_square_original( + image: Image.Image, background_color: Union[int, tuple[int, int, int]] = 0 + ) -> Image.Image: + width, height = image.size + if width == height: + return image + elif width > height: + result = Image.new(image.mode, (width, width), background_color) + result.paste(image, (0, (width - height) // 2)) + return result + else: + result = Image.new(image.mode, (height, height), background_color) + result.paste(image, ((height - width) // 2, 0)) + return result + + for i, image_processing_class in enumerate(self.image_processor_list): + image_processor = image_processing_class.from_dict(self.image_processor_dict) + numpify = i == 0 + torchify = i == 1 + image_inputs = self.image_processor_tester.prepare_image_inputs( + equal_resolution=False, numpify=numpify, torchify=torchify + ) + + # test with images in channel-last and channel-first format (only channel-first for torch) + for image in image_inputs: + padded_image = image_processor.pad_to_square(image) + if i == 0: + padded_image_original = pad_to_square_original(Image.fromarray(image)) + padded_image_original = np.array(padded_image_original) + + np.testing.assert_allclose(padded_image, padded_image_original) + + padded_image = image_processor.pad_to_square( + image.transpose(2, 0, 1), input_data_format="channels_first" + ) + padded_image = padded_image.transpose(1, 2, 0) + + np.testing.assert_allclose(padded_image, padded_image_original) + else: + padded_image_original = pad_to_square_original(F.to_pil_image(image)) + padded_image = padded_image.permute(1, 2, 0) + np.testing.assert_allclose(padded_image, padded_image_original) + + # test background color + background_color = (122, 116, 104) + for image in image_inputs: + padded_image = image_processor.pad_to_square(image, background_color=background_color) + if i == 0: + padded_image_original = pad_to_square_original( + Image.fromarray(image), background_color=background_color + ) + else: + padded_image_original = pad_to_square_original( + F.to_pil_image(image), background_color=background_color + ) + padded_image = padded_image.permute(1, 2, 0) + padded_image_original = np.array(padded_image_original) + + np.testing.assert_allclose(padded_image, padded_image_original) + + background_color = 122 + for image in image_inputs: + padded_image = image_processor.pad_to_square(image, background_color=background_color) + if i == 0: + padded_image_original = pad_to_square_original( + Image.fromarray(image), background_color=background_color + ) + else: + padded_image_original = pad_to_square_original( + F.to_pil_image(image), background_color=background_color + ) + padded_image = padded_image.permute(1, 2, 0) + padded_image_original = np.array(padded_image_original) + np.testing.assert_allclose(padded_image, padded_image_original) + + # background color length should match channel length + with self.assertRaises(ValueError): + padded_image = image_processor.pad_to_square(image_inputs[0], background_color=(122, 104)) + + with self.assertRaises(ValueError): + padded_image = image_processor.pad_to_square(image_inputs[0], background_color=(122, 104, 0, 0)) + + @unittest.skip(reason="LLaVa does not support 4 channel images yet") + # Ignore copy + def test_call_numpy_4_channels(self): + pass diff --git a/docs/transformers/tests/models/llava/test_modeling_llava.py 
b/docs/transformers/tests/models/llava/test_modeling_llava.py new file mode 100644 index 0000000000000000000000000000000000000000..1072d9043ea0e6ad4aaaedb85a2d99f0b6931aa5 --- /dev/null +++ b/docs/transformers/tests/models/llava/test_modeling_llava.py @@ -0,0 +1,630 @@ +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Llava model.""" + +import unittest + +import requests +from parameterized import parameterized + +from transformers import ( + AutoProcessor, + AutoTokenizer, + LlavaConfig, + LlavaForConditionalGeneration, + is_torch_available, + is_vision_available, +) +from transformers.testing_utils import ( + cleanup, + require_bitsandbytes, + require_torch, + require_vision, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + + +if is_vision_available(): + from PIL import Image + + +class LlavaVisionText2TextModelTester: + def __init__( + self, + parent, + ignore_index=-100, + image_token_index=0, + projector_hidden_act="gelu", + seq_length=7, + vision_feature_select_strategy="default", + vision_feature_layer=-1, + text_config={ + "model_type": "llama", + "seq_length": 7, + "is_training": True, + "use_input_mask": True, + "use_token_type_ids": False, + "use_labels": True, + "vocab_size": 99, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 16, + "type_sequence_label_size": 2, + "initializer_range": 0.02, + "num_labels": 3, + "num_choices": 4, + "pad_token_id": 1, + }, + is_training=True, + vision_config={ + "image_size": 8, + "patch_size": 2, + "num_channels": 3, + "is_training": True, + "hidden_size": 32, + "projection_dim": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "dropout": 0.1, + "attention_dropout": 0.1, + "initializer_range": 0.02, + }, + ): + self.parent = parent + self.ignore_index = ignore_index + self.image_token_index = image_token_index + self.projector_hidden_act = projector_hidden_act + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + self.text_config = text_config + self.vision_config = vision_config + self.pad_token_id = text_config["pad_token_id"] + + self.num_hidden_layers = text_config["num_hidden_layers"] + self.vocab_size = text_config["vocab_size"] + self.hidden_size = text_config["hidden_size"] + self.num_attention_heads = text_config["num_attention_heads"] + self.is_training = is_training + + self.batch_size = 3 + self.num_channels = 3 + self.image_size = 336 + self.num_image_tokens = (self.vision_config["image_size"] // 
self.vision_config["patch_size"]) ** 2 + self.seq_length = seq_length + self.num_image_tokens + self.encoder_seq_length = self.seq_length + + def get_config(self): + return LlavaConfig( + text_config=self.text_config, + vision_config=self.vision_config, + ignore_index=self.ignore_index, + image_token_index=self.image_token_index, + projector_hidden_act=self.projector_hidden_act, + vision_feature_select_strategy=self.vision_feature_select_strategy, + vision_feature_layer=self.vision_feature_layer, + image_seq_length=self.num_image_tokens, + ) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [ + self.batch_size, + self.vision_config["num_channels"], + self.vision_config["image_size"], + self.vision_config["image_size"], + ] + ) + config = self.get_config() + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 + attention_mask = input_ids.ne(1).to(torch_device) + input_ids[input_ids == config.image_token_index] = self.pad_token_id + input_ids[:, : self.num_image_tokens] = config.image_token_index + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + """ + Model tester for `LlavaForConditionalGeneration`. + """ + + all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = ( + {"image-to-text": LlavaForConditionalGeneration, "image-text-to-text": LlavaForConditionalGeneration} + if is_torch_available() + else {} + ) + test_pruning = False + test_head_masking = False + _is_composite = True + + def setUp(self): + self.model_tester = LlavaVisionText2TextModelTester(self) + common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + # while some other models require pixel_values to be present + def test_inputs_embeds_matches_input_ids(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + inputs_embeds = model.get_input_embeddings()(input_ids) + + with torch.no_grad(): + out_ids = 
model(input_ids=input_ids, **inputs)[0] + out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] + torch.testing.assert_close(out_embeds, out_ids) + + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs throw an error with explicit message saying what is wrong + when number of images doesn't match number of image tokens in the text. + Also we need to test multi-image cases when one prompt has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successful forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] + with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values) + + @parameterized.expand( + [ + (-1,), + ([-1],), + ([-1, -2],), + ], + ) + def test_vision_feature_layers(self, vision_feature_layer): + """ + Test that we can use either one vision feature layer, or a list of + vision feature layers. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.vision_feature_layer = vision_feature_layer + + num_feature_layers = 1 if isinstance(vision_feature_layer, int) else len(vision_feature_layer) + hidden_size = config.vision_config.hidden_size + expected_features = hidden_size * num_feature_layers + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + # We should have the right number of input features, + # and should be able to run a forward pass without exploding + assert model.multi_modal_projector.linear_1.in_features == expected_features + model(**input_dict) + + @unittest.skip( + reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip( + "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs.
Can be tested as part of LLM test" + ) + def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): + pass + + +@require_torch +class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase): + def setUp(self): + self.processor = AutoProcessor.from_pretrained("llava-hf/bakLlava-v1-hf") + + def tearDown(self): + cleanup(torch_device, gc_collect=True) + + @slow + @require_bitsandbytes + def test_small_model_integration_test(self): + # Let's make sure we test the preprocessing to replace what is used + model = LlavaForConditionalGeneration.from_pretrained("llava-hf/bakLlava-v1-hf", load_in_4bit=True) + + prompt = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:" + image_file = "https://llava-vl.github.io/static/images/view.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20) + EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly," # fmt: skip + + self.assertEqual( + self.processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_single(self): + # Let's make sure we test the preprocessing to replace what is used + model_id = "llava-hf/llava-1.5-7b-hf" + + model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + prompt = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT:" + image_file = "https://llava-vl.github.io/static/images/view.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) + + output = model.generate(**inputs, max_new_tokens=900, do_sample=False) + EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." # fmt: skip + + self.assertEqual( + processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_batched(self): + # Let's make sure we test the preprocessing to replace what is used + model_id = "llava-hf/llava-1.5-7b-hf" + + model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? 
ASSISTANT:", + "USER: \nWhat is this? ASSISTANT:", + ] + image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, you', 'USER: \nWhat is this? ASSISTANT: The image features two cats lying down on a pink couch. One cat is located on'] # fmt: skip + + self.assertEqual( + processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_batch(self): + # Let's make sure we test the preprocessing to replace what is used + model = LlavaForConditionalGeneration.from_pretrained("llava-hf/bakLlava-v1-hf", load_in_4bit=True) + # The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!. + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:", + "USER: \nWhat is this?\nASSISTANT:", + ] + image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to( + torch_device + ) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = [ + 'USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring.', + 'USER: \nWhat is this?\nASSISTANT: Cats' + ] # fmt: skip + self.assertEqual( + self.processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_batched_regression(self): + # Let's make sure we test the preprocessing to replace what is used + model_id = "llava-hf/llava-1.5-7b-hf" + + # Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before) + model = LlavaForConditionalGeneration.from_pretrained( + "llava-hf/llava-1.5-7b-hf", load_in_4bit=True, attn_implementation="eager" + ) + processor = AutoProcessor.from_pretrained(model_id, pad_token="") + + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? 
What should I bring with me?\nASSISTANT:", + "USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT:", + ] + image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = processor(images=[image1, image2, image1], text=prompts, return_tensors="pt", padding=True).to( + torch_device + ) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, which appears to be a dock or pier extending over a body of water', 'USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT: A cat sleeping on a bed.'] # fmt: skip + + self.assertEqual( + processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_torch + @require_vision + def test_batched_generation(self): + model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True) + + processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + + prompt1 = "\n\nUSER: What's the difference of two images?\nASSISTANT:" + prompt2 = "\nUSER: Describe the image.\nASSISTANT:" + prompt3 = "\nUSER: Describe the image.\nASSISTANT:" + url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" + url2 = "https://images.unsplash.com/photo-1617258683320-61900b281ced?q=80&w=3087&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" + image1 = Image.open(requests.get(url1, stream=True).raw) + image2 = Image.open(requests.get(url2, stream=True).raw) + + inputs = processor( + images=[image1, image2, image1, image2], + text=[prompt1, prompt2, prompt3], + return_tensors="pt", + padding=True, + ).to(torch_device) + + model = model.eval() + + EXPECTED_OUTPUT = [ + "\n \nUSER: What's the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while", + "\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small", + "\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. 
The llama is the", + ] + + generate_ids = model.generate(**inputs, max_new_tokens=20) + outputs = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) + self.assertEqual(outputs, EXPECTED_OUTPUT) + + def test_tokenizer_integration(self): + slow_tokenizer = AutoTokenizer.from_pretrained("liuhaotian/llava-v1.6-34b", use_fast=False) + slow_tokenizer.add_tokens("", True) + + fast_tokenizer = AutoTokenizer.from_pretrained( + "liuhaotian/llava-v1.6-34b", + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + from_slow=True, + legacy=False, + ) + fast_tokenizer.add_tokens("", True) + + prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" + EXPECTED_OUTPUT = ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n'] # fmt: skip + self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) + self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) + + @slow + @require_bitsandbytes + def test_generation_no_images(self): + model_id = "llava-hf/llava-1.5-7b-hf" + model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + # Prepare inputs with no images + inputs = processor(text="Hello, I am", return_tensors="pt").to(torch_device) + + # Make sure that `generate` works + _ = model.generate(**inputs, max_new_tokens=20) + + @slow + @require_bitsandbytes + def test_generation_siglip_backbone(self): + model_id = "llava-hf/llava-interleave-qwen-0.5b-hf" + model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype="float16", device_map=torch_device) + processor = AutoProcessor.from_pretrained(model_id) + + # check processing with expansion of inputs (w/o expansion should work with any backbone) + processor.vision_feature_select_strategy = "default" + processor.patch_size = 14 + + image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = processor( + text="<|im_start|>user\n\nWhat are these?<|im_end|>\n<|im_start|>assistant", + images=raw_image, + return_tensors="pt", + ).to(torch_device, torch.float16) + + # Make sure that `generate` works + output = model.generate(**inputs, max_new_tokens=30) + + EXPECTED_DECODED_TEXT = "user\n\nWhat are these?\nassistant The image shows two cats, one on the left and one on the right. They appear to be resting or sleeping on a pink blanket. 
The cat" + self.assertTrue(processor.batch_decode(output, skip_special_tokens=True)[0] == EXPECTED_DECODED_TEXT) + + @slow + def test_pixtral(self): + model_id = "mistral-community/pixtral-12b" + model = LlavaForConditionalGeneration.from_pretrained(model_id) + processor = AutoProcessor.from_pretrained(model_id) + + IMG_URLS = [ + Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw), + ] + PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]" + + # image = Image.open(requests.get(url, stream=True).raw) + inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(model.device) + generate_ids = model.generate(**inputs, max_new_tokens=500) + ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + print(ouptut) + + # fmt: off + EXPECTED_GENERATION = """ +Describe the images. +Certainly! Here are the descriptions of the images: + +1. **Image 1**: This image features a black dog with a glossy coat sitting on a wooden surface. The dog has a calm and attentive expression, looking directly at the camera. The wooden background has a rustic appearance with visible grain and texture. + +2. **Image 2**: This image captures a breathtaking view of a mountainous landscape. The mountains are rugged and covered with patches of green vegetation. The sky above is clear, and the scene conveys a sense of tranquility and natural beauty. + +3. **Image 3**: This image shows a beach scene during sunset. The waves are gently rolling onto the shore, and several people can be seen in the water, possibly surfing or swimming. The sky is painted with warm hues of orange and yellow, creating a serene and picturesque atmosphere. + +4. **Image 4**: This image depicts a narrow, winding path that cuts through a lush, green landscape. On either side of the path, there is dense grass and various trees, including a prominent tree with white blossoms. The sky is clear and blue, adding to the peaceful and inviting ambiance of the scene. + +These descriptions provide a detailed overview of the content and atmosphere of each image. +""" + # fmt: on + # check that both inputs are handled correctly and generate the same output + self.assertEqual(ouptut, EXPECTED_GENERATION) + + @slow + @require_bitsandbytes + def test_pixtral_4bit(self): + model_id = "mistral-community/pixtral-12b" + model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + IMG_URLS = [ + Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw), + ] + PROMPT = "[INST][IMG][IMG]Describe the images.[/INST]" + + inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(torch_device, torch.float16) + generate_ids = model.generate(**inputs, max_new_tokens=50) + output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + EXPECTED_GENERATION = [ + # CUDA output + "Describe the images. The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. 
The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador", + # XPU output + "Describe the images.The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which covers the entire background. The dog appears to be the main focus", + ] # fmt: skip + self.assertTrue(output in EXPECTED_GENERATION) + + @slow + @require_bitsandbytes + def test_pixtral_batched(self): + model_id = "mistral-community/pixtral-12b" + model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id + + IMG_URLS = [ + Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/17/150/500", stream=True).raw), + ] + PROMPT = [ + "[INST][IMG]What breed is the dog?[/INST]", + "[INST][IMG]What is shown in this image?[/INST]", + ] + + inputs = processor(text=PROMPT, images=IMG_URLS, padding=True, return_tensors="pt").to( + torch_device, torch.float16 + ) + generate_ids = model.generate(**inputs, max_new_tokens=50) + output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) + + EXPECTED_GENERATION = [ + 'What breed is the dog?The dog in the image is a black Labrador Retriever.', + 'What is shown in this image?The image depicts a narrow, winding dirt path surrounded by lush greenery. The path is flanked by grass and shrubs on both sides. On the left side, there are tall trees and dense foliage, while on the right side, there' + ] # fmt: skip + self.assertEqual(output, EXPECTED_GENERATION) diff --git a/docs/transformers/tests/models/llava/test_processor_llava.py b/docs/transformers/tests/models/llava/test_processor_llava.py new file mode 100644 index 0000000000000000000000000000000000000000..51ed955b845f1df0f7165d7e45272463c18fdd41 --- /dev/null +++ b/docs/transformers/tests/models/llava/test_processor_llava.py @@ -0,0 +1,108 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import json +import shutil +import tempfile +import unittest + +from transformers import AutoProcessor, AutoTokenizer, LlamaTokenizerFast, LlavaProcessor +from transformers.testing_utils import require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import CLIPImageProcessor + +if is_torch_available: + pass + + +@require_vision +class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = LlavaProcessor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + + image_processor = CLIPImageProcessor(do_center_crop=False) + tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b") + tokenizer.add_special_tokens({"additional_special_tokens": [""]}) + processor_kwargs = cls.prepare_processor_dict() + processor = LlavaProcessor(image_processor, tokenizer, **processor_kwargs) + processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.image_token + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname, ignore_errors=True) + + @staticmethod + def prepare_processor_dict(): + return { + "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}", + "patch_size": 128, + "vision_feature_select_strategy": "default" + } # fmt: skip + + def test_chat_template_is_saved(self): + processor_loaded = self.processor_class.from_pretrained(self.tmpdirname) + processor_dict_loaded = json.loads(processor_loaded.to_json_string()) + # chat templates aren't serialized to json in processors + self.assertFalse("chat_template" in processor_dict_loaded.keys()) + + # they have to be saved as separate file and loaded back from that file + # so we check if the same template is loaded + processor_dict = self.prepare_processor_dict() + self.assertTrue(processor_loaded.chat_template == processor_dict.get("chat_template", None)) + + def test_can_load_various_tokenizers(self): + for checkpoint in ["Intel/llava-gemma-2b", "llava-hf/llava-1.5-7b-hf"]: + processor = LlavaProcessor.from_pretrained(checkpoint) + tokenizer = AutoTokenizer.from_pretrained(checkpoint) + self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__) + + def test_special_mm_token_truncation(self): + """Tests that special vision tokens do not get truncated when `truncation=True` is set.""" + + processor = LlavaProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + + input_str = self.prepare_text_inputs(batch_size=2, modality="image") + image_input = self.prepare_image_inputs(batch_size=2) + + _ = processor( + text=input_str, + images=image_input, + 
return_tensors="pt", + truncation=None, + padding=True, + ) + + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=True, + padding=True, + max_length=5, + ) diff --git a/docs/transformers/tests/models/llava_next/__init__.py b/docs/transformers/tests/models/llava_next/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/llava_next/test_image_processing_llava_next.py b/docs/transformers/tests/models/llava_next/test_image_processing_llava_next.py new file mode 100644 index 0000000000000000000000000000000000000000..d3229c1bcd377ee518688d732efb64873a24545c --- /dev/null +++ b/docs/transformers/tests/models/llava_next/test_image_processing_llava_next.py @@ -0,0 +1,232 @@ +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD +from transformers.models.llava_next.image_processing_llava_next import select_best_resolution +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import LlavaNextImageProcessor + + if is_torchvision_available(): + from transformers import LlavaNextImageProcessorFast + + +class LlavaNextImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=OPENAI_CLIP_MEAN, + image_std=OPENAI_CLIP_STD, + do_convert_rgb=True, + ): + super().__init__() + size = size if size is not None else {"shortest_edge": 20} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + # Copied from 
tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape + def expected_output_image_shape(self, images): + return self.num_channels, self.crop_size["height"], self.crop_size["width"] + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class LlavaNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = LlavaNextImageProcessor if is_vision_available() else None + fast_image_processing_class = LlavaNextImageProcessorFast if is_torchvision_available() else None + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->LlavaNext + def setUp(self): + super().setUp() + self.image_processor_tester = LlavaNextImageProcessingTester(self) + + @property + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + self.assertTrue(hasattr(image_processing, "image_grid_pinpoints")) + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.test_image_processor_from_dict_with_kwargs + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 20}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) + self.assertEqual(image_processor.size, {"shortest_edge": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) + + def test_select_best_resolution(self): + possible_resolutions = [[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]] + + # Test with a square aspect ratio + best_resolution = select_best_resolution((336, 336), possible_resolutions) + self.assertEqual(best_resolution, (672, 336)) + + def test_call_pil(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not 
batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = (1, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + def test_call_numpy(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True) + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = (1, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + def test_call_pytorch(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True) + + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = (1, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + @unittest.skip( + reason="LlavaNextImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" + ) # FIXME Amy + def test_call_numpy_4_channels(self): + pass + + def test_nested_input(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) + + # Test batched as a list of images + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched as a nested list of images, where each sublist is one batch + image_inputs_nested = [image_inputs[:3], image_inputs[3:]] + encoded_images_nested = image_processing(image_inputs_nested, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape) + + # Image processor should return same pixel values, independently of input format + self.assertTrue((encoded_images_nested == encoded_images).all()) diff --git a/docs/transformers/tests/models/llava_next/test_modeling_llava_next.py
b/docs/transformers/tests/models/llava_next/test_modeling_llava_next.py new file mode 100644 index 0000000000000000000000000000000000000000..3b9fc36521e801763c56ab417d3a71bf379af6c3 --- /dev/null +++ b/docs/transformers/tests/models/llava_next/test_modeling_llava_next.py @@ -0,0 +1,552 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Llava-NeXT model.""" + +import unittest + +import requests +from huggingface_hub import hf_hub_download +from parameterized import parameterized + +from transformers import ( + AutoProcessor, + LlavaNextConfig, + LlavaNextForConditionalGeneration, + is_torch_available, + is_vision_available, +) +from transformers.testing_utils import ( + cleanup, + require_bitsandbytes, + require_torch, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, +) + + +if is_torch_available(): + import torch + + from transformers.models.llava_next.modeling_llava_next import image_size_to_num_patches + + +if is_vision_available(): + from PIL import Image + + +class LlavaNextVisionText2TextModelTester: + def __init__( + self, + parent, + ignore_index=-100, + image_token_index=0, + projector_hidden_act="gelu", + seq_length=7, + vision_feature_select_strategy="default", + vision_feature_layer=-1, + text_config={ + "model_type": "llama", + "seq_length": 7, + "is_training": True, + "use_input_mask": True, + "use_token_type_ids": False, + "use_labels": True, + "vocab_size": 99, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 580, + "type_vocab_size": 16, + "type_sequence_label_size": 2, + "initializer_range": 0.02, + "num_labels": 3, + "num_choices": 4, + "pad_token_id": 1, + }, + is_training=True, + vision_config={ + "image_size": 8, + "patch_size": 4, + "num_channels": 3, + "is_training": True, + "hidden_size": 32, + "projection_dim": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "dropout": 0.1, + "attention_dropout": 0.1, + "initializer_range": 0.02, + }, + ): + self.parent = parent + self.ignore_index = ignore_index + self.image_token_index = image_token_index + self.projector_hidden_act = projector_hidden_act + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + self.text_config = text_config + self.vision_config = vision_config + self.pad_token_id = text_config["pad_token_id"] + + self.num_hidden_layers = text_config["num_hidden_layers"] + self.vocab_size = text_config["vocab_size"] + self.hidden_size = text_config["hidden_size"] + self.num_attention_heads = text_config["num_attention_heads"] + 
self.is_training = is_training + + self.batch_size = 3 + self.num_channels = 3 + self.image_size = 30 + self.image_grid_pinpoints = [[16, 16]] + self.num_image_tokens = 24 + self.seq_length = seq_length + self.num_image_tokens + self.encoder_seq_length = self.seq_length + + def get_config(self): + return LlavaNextConfig( + text_config=self.text_config, + vision_config=self.vision_config, + ignore_index=self.ignore_index, + image_token_index=self.image_token_index, + projector_hidden_act=self.projector_hidden_act, + vision_feature_select_strategy=self.vision_feature_select_strategy, + vision_feature_layer=self.vision_feature_layer, + image_grid_pinpoints=self.image_grid_pinpoints, + image_seq_length=self.num_image_tokens, + ) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [ + self.batch_size, + 5, + self.vision_config["num_channels"], + self.vision_config["image_size"], + self.vision_config["image_size"], + ] + ) + config = self.get_config() + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 + attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) + input_ids[input_ids == config.image_token_index] = self.pad_token_id + + input_ids[:, : self.num_image_tokens] = config.image_token_index + + inputs_dict = { + "pixel_values": pixel_values, + "image_sizes": torch.tensor( + [[self.vision_config["image_size"], self.vision_config["image_size"]]] * self.batch_size + ), + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + """ + Model tester for `LlavaNextForConditionalGeneration`. 
+ """ + + all_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = {"image-text-to-text": LlavaNextForConditionalGeneration} if is_torch_available() else {} + test_pruning = False + test_head_masking = False + _is_composite = True + + def setUp(self): + self.model_tester = LlavaNextVisionText2TextModelTester(self) + common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaNextConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if "image_newline" in name: + continue + elif param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + # while some other models require pixel_values to be present + def test_inputs_embeds_matches_input_ids(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + inputs_embeds = model.get_input_embeddings()(input_ids) + + with torch.no_grad(): + out_ids = model(input_ids=input_ids, **inputs)[0] + out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] + torch.testing.assert_close(out_embeds, out_ids) + + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs throw an error with an explicit message saying what is wrong + when the number of images doesn't match the number of image tokens in the text. + We also need to test multi-image cases where one prompt has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successful forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] + input_dict["image_sizes"] = input_dict["image_sizes"][-1:, ...]
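+            # dropping an image while keeping its placeholder tokens should surface as an explicit ValueError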
+ with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + image_sizes = input_dict["image_sizes"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + image_sizes = torch.cat([image_sizes, image_sizes], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) + + @parameterized.expand( + [ + (-1,), + ([-1],), + ([-1, -2],), + ], + ) + def test_vision_feature_layers(self, vision_feature_layer): + """ + Test that we can use either one vision feature layer, or a list of + vision feature layers. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.vision_feature_layer = vision_feature_layer + + num_feature_layers = 1 if isinstance(vision_feature_layer, int) else len(vision_feature_layer) + hidden_size = config.vision_config.hidden_size + expected_features = hidden_size * num_feature_layers + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + # We should have the right number of input features, + # and should be able to run a forward pass without exploding + assert model.multi_modal_projector.linear_1.in_features == expected_features + model(**input_dict) + + @unittest.skip( + reason="This architecture seems not to compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seems not to compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seems not to compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip( + "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test" + ) + def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): + pass + + +@require_torch +class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase): + def setUp(self): + self.processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf") + url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" + self.image = Image.open(requests.get(url, stream=True).raw) + + self.prompt = "[INST] \nWhat is shown in this image? 
[/INST]" + + def tearDown(self): + cleanup(torch_device, gc_collect=True) + + @slow + @require_bitsandbytes + def test_small_model_integration_test(self): + model = LlavaNextForConditionalGeneration.from_pretrained( + "llava-hf/llava-v1.6-mistral-7b-hf", + load_in_4bit=True, + ) + + inputs = self.processor(images=self.image, text=self.prompt, return_tensors="pt") + + # verify inputs against original implementation + filepath = hf_hub_download( + repo_id="nielsr/test-image", + filename="llava_1_6_input_ids.pt", + repo_type="dataset", + ) + original_input_ids = torch.load(filepath, map_location="cpu", weights_only=True) + # replace -200 by image_token_index (since we use token ID = 32000 for the image token) + # remove image token indices because HF impl expands image tokens `image_seq_length` times + original_input_ids = original_input_ids[original_input_ids != -200] + observed_input_ids = inputs.input_ids[inputs.input_ids != model.config.image_token_index] + assert original_input_ids[0].tolist() == observed_input_ids[0].tolist() + + filepath = hf_hub_download( + repo_id="nielsr/test-image", + filename="llava_1_6_pixel_values.pt", + repo_type="dataset", + ) + original_pixel_values = torch.load(filepath, map_location="cpu", weights_only=True) + assert torch.allclose(original_pixel_values, inputs.pixel_values.half()) + + # verify generation + output = model.generate(**inputs, max_new_tokens=100) + EXPECTED_DECODED_TEXT = '[INST] \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays values for multiple quantitative variables represented on axes starting from the same point. This particular radar chart is showing the performance of various models or systems across different metrics or datasets.\n\nThe chart is divided into several sections, each representing a different model or dataset. The axes represent different metrics or datasets, such as "MMM-Vet," "MMM-Bench," "L' # fmt: skip + + self.assertEqual( + self.processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_batch(self): + model = LlavaNextForConditionalGeneration.from_pretrained( + "llava-hf/llava-v1.6-mistral-7b-hf", load_in_4bit=True + ) + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + cats_image = Image.open(requests.get(url, stream=True).raw) + + inputs = self.processor( + images=[self.image, cats_image], + text=[self.prompt, self.prompt], + return_tensors="pt", + padding=True, + ).to(torch_device) + + # it should not matter whether two images are the same size or not + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = ['[INST] \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays', '[INST] \nWhat is shown in this image? [/INST] The image shows two cats lying on a pink surface, which appears to be a couch or a cush'] # fmt: skip + self.assertEqual( + self.processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_unk_token(self): + # related to (#29835) + model = LlavaNextForConditionalGeneration.from_pretrained( + "llava-hf/llava-v1.6-mistral-7b-hf", + load_in_4bit=True, + ) + + prompt_with_unk = "[INST] \nWhat is shown in this image? 
[/INST]" + inputs = self.processor(images=self.image, text=prompt_with_unk, return_tensors="pt") + + # verify single forward pass + inputs = inputs.to(torch_device) + with torch.no_grad(): + output = model(**inputs) + + # verify generation + output = model.generate(**inputs, max_new_tokens=40) + EXPECTED_DECODED_TEXT = '[INST] \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays values for multiple quantitative variables represented on axes starting from the same point. This particular radar chart' # fmt: skip + + self.assertEqual( + self.processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_batch_different_resolutions(self): + model = LlavaNextForConditionalGeneration.from_pretrained( + "llava-hf/llava-v1.6-mistral-7b-hf", + load_in_4bit=True, + ) + + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + lowres_url = "https://4.img-dpreview.com/files/p/TS560x560~forums/56876524/03975b28741443319e9a94615e35667e" + cats_image = Image.open(requests.get(url, stream=True).raw) + lowres_img = Image.open(requests.get(lowres_url, stream=True).raw) + + inputs = self.processor( + images=[lowres_img, cats_image], text=[self.prompt, self.prompt], return_tensors="pt", padding=True + ).to(torch_device) + pixel_values = inputs["pixel_values"] + + # verify pixel values are padded correctly with 0 when one image has more num_patches than the other + image_num_patches = [ + image_size_to_num_patches( + image_size=imsize, + grid_pinpoints=model.config.image_grid_pinpoints, + patch_size=model.config.vision_config.image_size, + ) + for imsize in inputs["image_sizes"] + ] + for pix_val, num_patch in zip(pixel_values, image_num_patches): + self.assertTrue(torch.all(pix_val[num_patch:] == 0)) # pad on the right + for i in range(num_patch): + self.assertFalse(torch.all(pix_val[i : i + 1] == 0)) # no padding expected in any of patches + + # verify generation + output = model.generate(**inputs, max_new_tokens=50) + EXPECTED_DECODED_TEXT = '[INST] \nWhat is shown in this image? [/INST] The image shows two deer, likely fawns, in a grassy area with trees in the background. 
The setting appears to be a forest or woodland, and the time of day seems to be either dawn or dusk, given the soft' # fmt: skip + self.assertEqual( + self.processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_batch_matches_single(self): + model = LlavaNextForConditionalGeneration.from_pretrained( + "llava-hf/llava-v1.6-mistral-7b-hf", + load_in_4bit=True, + ) + + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + lowres_url = "https://4.img-dpreview.com/files/p/TS560x560~forums/56876524/03975b28741443319e9a94615e35667e" + cats_image = Image.open(requests.get(url, stream=True).raw) + lowres_img = Image.open(requests.get(lowres_url, stream=True).raw) + + inputs_batched = self.processor( + images=[lowres_img, cats_image], text=[self.prompt, self.prompt], return_tensors="pt", padding=True + ).to(torch_device) + + inputs_single = self.processor(images=lowres_img, text=self.prompt, return_tensors="pt", padding=True).to( + torch_device + ) + + # verify generation + output_batched = model.generate(**inputs_batched, max_new_tokens=50) + output_single = model.generate(**inputs_single, max_new_tokens=50) + self.assertEqual( + self.processor.decode(output_batched[0], skip_special_tokens=True), + self.processor.decode(output_single[0], skip_special_tokens=True), + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_full_vision_state_selection(self): + model = LlavaNextForConditionalGeneration.from_pretrained( + "llava-hf/llava-v1.6-mistral-7b-hf", + load_in_4bit=True, + ) + # test that changing `strategy` won't error out + model.vision_feature_select_strategy = "full" + + inputs = self.processor(self.prompt, self.image, return_tensors="pt").to(model.device) + + # verify generation + output = model.generate(**inputs, max_new_tokens=30) + EXPECTED_DECODED_TEXT = '[INST] \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays values for multiple quantitative variables represented on axes' # fmt: skip + + self.assertEqual( + self.processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + def test_granite_vision(self): + """ + Check the expected output of a granite vision model, which leverages + multiple vision feature layers and a visual encoder with no CLS (siglip). + """ + granite_model_path = "ibm-granite/granite-vision-3.1-2b-preview" + model = LlavaNextForConditionalGeneration.from_pretrained(granite_model_path) + self.processor = AutoProcessor.from_pretrained(granite_model_path) + prompt = "<|user|>\n\nWhat is shown in this image?\n<|assistant|>\n" + inputs = self.processor(prompt, self.image, return_tensors="pt").to(model.device) + + # verify generation + output = model.generate(**inputs, max_new_tokens=30) + EXPECTED_DECODED_TEXT = "<|user|>\n\nWhat is shown in this image?\n<|assistant|>\nThe image displays a radar chart comparing the performance of various machine learning models." 
# fmt: skip + self.assertEqual( + self.processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) diff --git a/docs/transformers/tests/models/llava_next/test_processor_llava_next.py b/docs/transformers/tests/models/llava_next/test_processor_llava_next.py new file mode 100644 index 0000000000000000000000000000000000000000..a565212b49e7698613be6c61418063d2cf429a13 --- /dev/null +++ b/docs/transformers/tests/models/llava_next/test_processor_llava_next.py @@ -0,0 +1,99 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import tempfile +import unittest + +import torch + +from transformers import AutoProcessor, LlamaTokenizerFast, LlavaNextProcessor +from transformers.testing_utils import ( + require_vision, +) +from transformers.utils import is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from transformers import LlavaNextImageProcessor + + +@require_vision +class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = LlavaNextProcessor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + + image_processor = LlavaNextImageProcessor() + tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b") + tokenizer.add_special_tokens({"additional_special_tokens": [""]}) + processor_kwargs = cls.prepare_processor_dict() + processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs) + processor.save_pretrained(cls.tmpdirname) + cls.image_token = processor.image_token + + def get_tokenizer(self, **kwargs): + return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + @staticmethod + def prepare_processor_dict(): + return { + "chat_template": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}", + "patch_size": 128, + "vision_feature_select_strategy": "default" + } # fmt: skip + + # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved + def test_chat_template_is_saved(self): + processor_loaded = self.processor_class.from_pretrained(self.tmpdirname) + processor_dict_loaded = json.loads(processor_loaded.to_json_string()) + # chat templates aren't serialized to 
json in processors + self.assertFalse("chat_template" in processor_dict_loaded.keys()) + + # they have to be saved as separate file and loaded back from that file + # so we check if the same template is loaded + processor_dict = self.prepare_processor_dict() + self.assertTrue(processor_loaded.chat_template == processor_dict.get("chat_template", None)) + + def test_image_token_filling(self): + processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-vicuna-7b-hf") + processor.patch_size = 14 + processor.vision_feature_select_strategy = "default" + # Important to check with non square image + image = torch.randint(0, 2, (3, 500, 316)) + expected_image_tokens = 1526 + image_token_index = 32000 + + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + ] + inputs = processor( + text=[processor.apply_chat_template(messages)], + images=[image], + return_tensors="pt", + ) + image_tokens = (inputs["input_ids"] == image_token_index).sum().item() + self.assertEqual(expected_image_tokens, image_tokens) diff --git a/docs/transformers/tests/models/llava_next_video/__init__.py b/docs/transformers/tests/models/llava_next_video/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/tests/models/llava_next_video/test_image_processing_llava_next_video.py b/docs/transformers/tests/models/llava_next_video/test_image_processing_llava_next_video.py new file mode 100644 index 0000000000000000000000000000000000000000..0a7cc1eace74a4441fb55a3714be81695d481ee5 --- /dev/null +++ b/docs/transformers/tests/models/llava_next_video/test_image_processing_llava_next_video.py @@ -0,0 +1,218 @@ +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
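+"""Testing suite for the LlavaNextVideo image processor."""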
+ +import unittest + +import numpy as np + +from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import LlavaNextVideoImageProcessor + + +class LlavaNextVideoProcessingTester: + def __init__( + self, + parent, + batch_size=5, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=80, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=OPENAI_CLIP_MEAN, + image_std=OPENAI_CLIP_STD, + do_convert_rgb=True, + ): + size = size if size is not None else {"shortest_edge": 20} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape + def expected_output_image_shape(self, images): + return self.num_channels, self.crop_size["height"], self.crop_size["width"] + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False): + images = prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + # let's simply copy the frames to fake a long video-clip + if numpify or torchify: + videos = [] + for image in images: + if numpify: + video = image[None, ...].repeat(8, 0) + else: + video = image[None, ...].repeat(8, 1, 1, 1) + videos.append(video) + else: + videos = [] + for pil_image in images: + videos.append([pil_image] * 8) + + return videos + + +@require_torch +@require_vision +class LlavaNextVideoProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = LlavaNextVideoImageProcessor if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = LlavaNextVideoProcessingTester(self) + + @property + # Copied from 
tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.test_image_processor_from_dict_with_kwargs + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 20}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) + self.assertEqual(image_processor.size, {"shortest_edge": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) + + def test_call_pil(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True) + for video in video_inputs: + self.assertIsInstance(video[0], Image.Image) + + # Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!) + encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos + expected_output_video_shape = (1, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos + expected_output_video_shape = (5, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + def test_call_numpy(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, numpify=True) + for video in video_inputs: + self.assertIsInstance(video, np.ndarray) + + # Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!) 
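+        # expected pixel_values_videos shape: (batch, frames, channels, height, width) -> a single 8-frame clip center-cropped to 18x18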
+ encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos + expected_output_video_shape = (1, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos + expected_output_video_shape = (5, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + def test_call_pytorch(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, torchify=True) + for video in video_inputs: + self.assertIsInstance(video, torch.Tensor) + + # Test not batched input + encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos + expected_output_video_shape = (1, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos + expected_output_video_shape = (5, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + @unittest.skip("LlavaNextVideoImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") + def test_call_numpy_4_channels(self): + pass diff --git a/docs/transformers/tests/models/llava_next_video/test_modeling_llava_next_video.py b/docs/transformers/tests/models/llava_next_video/test_modeling_llava_next_video.py new file mode 100644 index 0000000000000000000000000000000000000000..47c71d9c751bb638fe33e526ccfe07671493fa65 --- /dev/null +++ b/docs/transformers/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -0,0 +1,476 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Llava-NeXT-Video model.""" + +import unittest + +import numpy as np +from huggingface_hub import hf_hub_download +from parameterized import parameterized + +from transformers import ( + AutoProcessor, + LlavaNextVideoConfig, + LlavaNextVideoForConditionalGeneration, + is_torch_available, + is_vision_available, +) +from transformers.testing_utils import ( + cleanup, + require_bitsandbytes, + require_torch, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, +) + + +if is_torch_available(): + import torch + + +if is_vision_available(): + from PIL import Image + + +class LlavaNextVideoVisionText2TextModelTester: + def __init__( + self, + parent, + ignore_index=-100, + image_token_index=0, + video_token_index=1, + projector_hidden_act="gelu", + seq_length=7, + vision_feature_select_strategy="default", + vision_feature_layer=-1, + text_config={ + "model_type": "llama", + "seq_length": 7, + "is_training": True, + "use_input_mask": True, + "use_token_type_ids": False, + "use_labels": True, + "vocab_size": 99, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 580, + "type_vocab_size": 16, + "type_sequence_label_size": 2, + "initializer_range": 0.02, + "num_labels": 3, + "num_choices": 4, + "pad_token_id": 2, + }, + is_training=True, + vision_config={ + "image_size": 8, + "patch_size": 4, + "num_channels": 3, + "is_training": True, + "hidden_size": 32, + "projection_dim": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "dropout": 0.1, + "attention_dropout": 0.1, + "initializer_range": 0.02, + }, + ): + self.parent = parent + self.ignore_index = ignore_index + self.image_token_index = image_token_index + self.video_token_index = video_token_index + self.projector_hidden_act = projector_hidden_act + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + self.text_config = text_config + self.vision_config = vision_config + self.pad_token_id = text_config["pad_token_id"] + + self.num_hidden_layers = text_config["num_hidden_layers"] + self.vocab_size = text_config["vocab_size"] + self.hidden_size = text_config["hidden_size"] + self.num_attention_heads = text_config["num_attention_heads"] + self.is_training = is_training + + self.batch_size = 3 + self.num_channels = 3 + self.image_size = 30 + + self.image_grid_pinpoints = [[16, 16]] + self.num_image_tokens = 24 + self.num_video_tokens = 8 + self.seq_length = seq_length + self.num_image_tokens + self.num_video_tokens + + def get_config(self): + return LlavaNextVideoConfig( + text_config=self.text_config, + vision_config=self.vision_config, + ignore_index=self.ignore_index, + image_token_index=self.image_token_index, + video_token_index=self.video_token_index, + projector_hidden_act=self.projector_hidden_act, + vision_feature_select_strategy=self.vision_feature_select_strategy, + vision_feature_layer=self.vision_feature_layer, + image_grid_pinpoints=self.image_grid_pinpoints, + video_seq_length=self.num_video_tokens, + image_seq_length=self.num_image_tokens, + ) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [ + 
self.batch_size, + 5, + self.vision_config["num_channels"], + self.vision_config["image_size"], + self.vision_config["image_size"], + ] + ) + pixel_values_videos = floats_tensor( + [ + self.batch_size, + 8, + self.vision_config["num_channels"], + self.vision_config["image_size"], + self.vision_config["image_size"], + ] + ) + config = self.get_config() + + return config, pixel_values, pixel_values_videos + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_values_videos = self.prepare_config_and_inputs() + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 + attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) + + input_ids[input_ids == config.image_token_index] = self.pad_token_id + input_ids[input_ids == config.video_token_index] = self.pad_token_id + input_ids[:, : self.num_image_tokens] = config.image_token_index + input_ids[:, self.num_image_tokens : self.num_video_tokens + self.num_image_tokens] = config.video_token_index + + inputs_dict = { + "pixel_values": pixel_values, + "pixel_values_videos": pixel_values_videos, + "image_sizes": torch.tensor( + [[self.vision_config["image_size"], self.vision_config["image_size"]]] * self.batch_size + ), + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + +@require_torch +class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + """ + Model tester for `LlavaNextVideoForConditionalGeneration`. + """ + + all_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else () + test_pruning = False + test_head_masking = False + _is_composite = True + + def setUp(self): + self.model_tester = LlavaNextVideoVisionText2TextModelTester(self) + common_properties = ["image_token_index", "video_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaNextVideoConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if "image_newline" in name: + continue + elif param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + del inputs["pixel_values_videos"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + # while some other models require pixel_values to be present + def test_inputs_embeds_matches_input_ids(self): + config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + del inputs["pixel_values_videos"] + + inputs_embeds = model.get_input_embeddings()(input_ids) + + with torch.no_grad(): + out_ids = model(input_ids=input_ids, **inputs)[0] + out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] + torch.testing.assert_close(out_embeds, out_ids) + + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs throw an error with an explicit message saying what is wrong + when the number of images doesn't match the number of image tokens in the text. + We also need to test multi-image cases where one prompt has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successful forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] + input_dict["image_sizes"] = input_dict["image_sizes"][-1:, ...] + with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + image_sizes = input_dict["image_sizes"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + image_sizes = torch.cat([image_sizes, image_sizes], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) + + @parameterized.expand( + [ + (-1,), + ([-1],), + ([-1, -2],), + ], + ) + def test_vision_feature_layers(self, vision_feature_layer): + """ + Test that we can use either one vision feature layer, or a list of + vision feature layers. 
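+        When a list is passed, the hidden states of the selected layers are concatenated, so the test
+        expects `multi_modal_projector.linear_1` to take `hidden_size * num_feature_layers` input features.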
+ """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.vision_feature_layer = vision_feature_layer + + num_feature_layers = 1 if isinstance(vision_feature_layer, int) else len(vision_feature_layer) + hidden_size = config.vision_config.hidden_size + expected_features = hidden_size * num_feature_layers + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + # We should have the right number of input features, + # and should be able to run a forward pass without exploding + assert model.multi_modal_projector.linear_1.in_features == expected_features + model(**input_dict) + + @unittest.skip( + reason="This architecture seems not to compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seems not to compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seems not to compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip("FlashAttention only supports fp16 and bf16 data types") + def test_flash_attn_2_fp32_ln(self): + pass + + @unittest.skip( + "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test" + ) + def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): + pass + + +@require_torch +class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase): + def setUp(self): + self.processor = AutoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf") + image_file = hf_hub_download( + repo_id="raushan-testing-hf/images_test", filename="llava_v1_5_radar.jpg", repo_type="dataset" + ) + video_file = hf_hub_download( + repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset" + ) + self.image = Image.open(image_file) + self.video = np.load(video_file) + self.prompt_image = "USER: \nWhat is shown in this image? ASSISTANT:" + self.prompt_video = "USER: