Student0809 committed
Commit b381930 · verified · Parent: 326a7fe

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full list.
Files changed (50):
  1. .gitattributes +2 -0
  2. docs/transformers/tests/fixtures/spiece.model +3 -0
  3. docs/transformers/tests/fixtures/test_sentencepiece.model +3 -0
  4. docs/transformers/tests/fixtures/test_sentencepiece_bpe.model +3 -0
  5. docs/transformers/tests/fixtures/test_sentencepiece_bpe_char.model +3 -0
  6. docs/transformers/tests/fixtures/test_sentencepiece_no_bos.model +3 -0
  7. docs/transformers/tests/fixtures/test_sentencepiece_with_bytefallback.model +3 -0
  8. docs/transformers/tests/fixtures/tests_samples/COCO/000000004016.png +3 -0
  9. docs/transformers/tests/fixtures/tests_samples/COCO/000000039769.png +3 -0
  10. docs/transformers/tests/models/byt5/__init__.py +0 -0
  11. docs/transformers/tests/models/byt5/test_tokenization_byt5.py +366 -0
  12. docs/transformers/tests/models/camembert/__init__.py +0 -0
  13. docs/transformers/tests/models/camembert/test_modeling_tf_camembert.py +55 -0
  14. docs/transformers/tests/models/camembert/test_tokenization_camembert.py +220 -0
  15. docs/transformers/tests/models/canine/__init__.py +0 -0
  16. docs/transformers/tests/models/canine/test_modeling_canine.py +571 -0
  17. docs/transformers/tests/models/canine/test_tokenization_canine.py +339 -0
  18. docs/transformers/tests/models/chameleon/__init__.py +0 -0
  19. docs/transformers/tests/models/chameleon/test_image_processing_chameleon.py +204 -0
  20. docs/transformers/tests/models/chameleon/test_modeling_chameleon.py +481 -0
  21. docs/transformers/tests/models/chameleon/test_processor_chameleon.py +76 -0
  22. docs/transformers/tests/models/chinese_clip/__init__.py +0 -0
  23. docs/transformers/tests/models/chinese_clip/test_image_processing_chinese_clip.py +175 -0
  24. docs/transformers/tests/models/chinese_clip/test_modeling_chinese_clip.py +762 -0
  25. docs/transformers/tests/models/chinese_clip/test_processor_chinese_clip.py +217 -0
  26. docs/transformers/tests/models/clap/__init__.py +0 -0
  27. docs/transformers/tests/models/clap/test_feature_extraction_clap.py +546 -0
  28. docs/transformers/tests/models/clap/test_modeling_clap.py +755 -0
  29. docs/transformers/tests/models/clap/test_processor_clap.py +125 -0
  30. docs/transformers/tests/models/clip/__init__.py +0 -0
  31. docs/transformers/tests/models/clip/test_image_processing_clip.py +128 -0
  32. docs/transformers/tests/models/clip/test_modeling_clip.py +948 -0
  33. docs/transformers/tests/models/clip/test_modeling_flax_clip.py +468 -0
  34. docs/transformers/tests/models/clip/test_modeling_tf_clip.py +662 -0
  35. docs/transformers/tests/models/clip/test_processor_clip.py +199 -0
  36. docs/transformers/tests/models/clip/test_tokenization_clip.py +192 -0
  37. docs/transformers/tests/models/clipseg/__init__.py +0 -0
  38. docs/transformers/tests/models/clipseg/test_modeling_clipseg.py +714 -0
  39. docs/transformers/tests/models/clipseg/test_processor_clipseg.py +194 -0
  40. docs/transformers/tests/models/clvp/__init__.py +0 -0
  41. docs/transformers/tests/models/clvp/test_feature_extraction_clvp.py +240 -0
  42. docs/transformers/tests/models/clvp/test_modeling_clvp.py +640 -0
  43. docs/transformers/tests/models/clvp/test_processor_clvp.py +136 -0
  44. docs/transformers/tests/models/clvp/test_tokenization_clvp.py +317 -0
  45. docs/transformers/tests/models/code_llama/__init__.py +0 -0
  46. docs/transformers/tests/models/code_llama/test_tokenization_code_llama.py +653 -0
  47. docs/transformers/tests/models/codegen/__init__.py +0 -0
  48. docs/transformers/tests/models/codegen/test_modeling_codegen.py +492 -0
  49. docs/transformers/tests/models/codegen/test_tokenization_codegen.py +329 -0
  50. docs/transformers/tests/models/cohere/__init__.py +0 -0
.gitattributes CHANGED
@@ -36,3 +36,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  asset/banner.png filter=lfs diff=lfs merge=lfs -text
  docs/resources/web-ui.jpg filter=lfs diff=lfs merge=lfs -text
  docs/resources/dpo_data.png filter=lfs diff=lfs merge=lfs -text
+ docs/transformers/tests/fixtures/tests_samples/COCO/000000039769.png filter=lfs diff=lfs merge=lfs -text
+ docs/transformers/tests/fixtures/tests_samples/COCO/000000004016.png filter=lfs diff=lfs merge=lfs -text
docs/transformers/tests/fixtures/spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fefb02b667a6c5c2fe27602d28e5fb3428f66ab89c7d6f388e7c8d44a02d0336
+ size 760289
docs/transformers/tests/fixtures/test_sentencepiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8dfd1eae4522281b1b839eab877a791befec7a1663a41c814c77d9c89c748f2d
+ size 253154
docs/transformers/tests/fixtures/test_sentencepiece_bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4de78f5d11ee09141165d31da7dad97e809dd6ee7b52a0cbc6d76a973028286
+ size 251527
docs/transformers/tests/fixtures/test_sentencepiece_bpe_char.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560
+ size 238473
docs/transformers/tests/fixtures/test_sentencepiece_no_bos.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f3af97c2e7bc51d781e7440aa33deee7f482eac819d23fd24af80e7b4ce2646
+ size 253134
docs/transformers/tests/fixtures/test_sentencepiece_with_bytefallback.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c61ecce43369fc3bab9566464f0e71f3ad75dc2319a5aadc2a561e3e312502e3
+ size 270096
docs/transformers/tests/fixtures/tests_samples/COCO/000000004016.png ADDED

Git LFS Details

  • SHA256: 88bb5c2a7f00292db072f7d95638c6f5a46c7d824fff6412327e636eedc1fc24
  • Pointer size: 131 Bytes
  • Size of remote file: 636 kB
docs/transformers/tests/fixtures/tests_samples/COCO/000000039769.png ADDED

Git LFS Details

  • SHA256: cf6f3c4befa148732c7453e0de5afab00f682427435fead2d88b07a9615cdac2
  • Pointer size: 131 Bytes
  • Size of remote file: 694 kB
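
The SentencePiece fixtures above are committed as Git LFS pointer files rather than the model binaries themselves: each pointer records only the LFS spec version, the SHA-256 of the real payload, and its size in bytes. As a rough, hypothetical illustration (not part of this commit), such a pointer can be parsed with a few lines of Python:

def parse_lfs_pointer(text: str) -> dict:
    # Split each "key value" line of a pointer like the ones shown above into a dict entry.
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "sha256": fields["oid"].removeprefix("sha256:"),
        "size_bytes": int(fields["size"]),
    }

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:fefb02b667a6c5c2fe27602d28e5fb3428f66ab89c7d6f388e7c8d44a02d0336\n"
    "size 760289\n"
)
assert parse_lfs_pointer(pointer)["size_bytes"] == 760289
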
docs/transformers/tests/models/byt5/__init__.py ADDED
File without changes
docs/transformers/tests/models/byt5/test_tokenization_byt5.py ADDED
@@ -0,0 +1,366 @@
1
+ # Copyright 2020 Google T5 Authors and HuggingFace Inc. team.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ import os
17
+ import re
18
+ import shutil
19
+ import tempfile
20
+ import unittest
21
+ from functools import lru_cache
22
+
23
+ from transformers import AddedToken, BatchEncoding, ByT5Tokenizer
24
+ from transformers.utils import cached_property, is_tf_available, is_torch_available
25
+
26
+ from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
27
+
28
+
29
+ if is_torch_available():
30
+ FRAMEWORK = "pt"
31
+ elif is_tf_available():
32
+ FRAMEWORK = "tf"
33
+ else:
34
+ FRAMEWORK = "jax"
35
+
36
+
37
+ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
38
+ tokenizer_class = ByT5Tokenizer
39
+ test_rust_tokenizer = False
40
+
41
+ @classmethod
42
+ def setUpClass(cls):
43
+ super().setUpClass()
44
+ tokenizer = ByT5Tokenizer()
45
+ tokenizer.save_pretrained(cls.tmpdirname)
46
+
47
+ @cached_property
48
+ def t5_base_tokenizer(self):
49
+ return ByT5Tokenizer.from_pretrained("google/byt5-small")
50
+
51
+ @classmethod
52
+ @use_cache_if_possible
53
+ @lru_cache(maxsize=64)
54
+ def get_tokenizer(cls, pretrained_name=None, **kwargs) -> ByT5Tokenizer:
55
+ pretrained_name = pretrained_name or cls.tmpdirname
56
+ return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
57
+
58
+ def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> tuple[str, list]:
59
+ # XXX The default common tokenizer tests assume that every ID is decodable on its own.
60
+ # This assumption is invalid for ByT5 because single bytes might not be
61
+ # valid utf-8 (byte 128 for instance).
62
+ # Here we're overriding the smallest possible method to provide
63
+ # a clean sequence without making the same assumption.
64
+
65
+ toks = []
66
+ for i in range(len(tokenizer)):
67
+ try:
68
+ tok = tokenizer.decode([i], clean_up_tokenization_spaces=False)
69
+ except UnicodeDecodeError:
70
+ pass
71
+ toks.append((i, tok))
72
+
73
+ toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
74
+ toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks))
75
+ if max_length is not None and len(toks) > max_length:
76
+ toks = toks[:max_length]
77
+ if min_length is not None and len(toks) < min_length and len(toks) > 0:
78
+ while len(toks) < min_length:
79
+ toks = toks + toks
80
+ # toks_str = [t[1] for t in toks]
81
+ toks_ids = [t[0] for t in toks]
82
+
83
+ # Ensure consistency
84
+ output_txt = tokenizer.decode(toks_ids, clean_up_tokenization_spaces=False)
85
+ if " " not in output_txt and len(toks_ids) > 1:
86
+ output_txt = (
87
+ tokenizer.decode([toks_ids[0]], clean_up_tokenization_spaces=False)
88
+ + " "
89
+ + tokenizer.decode(toks_ids[1:], clean_up_tokenization_spaces=False)
90
+ )
91
+ if with_prefix_space:
92
+ output_txt = " " + output_txt
93
+ output_ids = tokenizer.encode(output_txt, add_special_tokens=False)
94
+ return output_txt, output_ids
95
+
96
+ def test_eos_treatment(self):
97
+ tokenizer = self.t5_base_tokenizer
98
+ batch_with_eos_added = tokenizer(["hi</s>", "I went to the gym</s>", "</s>"])
99
+ batch_without_eos_added = tokenizer(["hi", "I went to the gym", ""])
100
+ self.assertListEqual(batch_with_eos_added["input_ids"], batch_without_eos_added["input_ids"])
101
+
102
+ def test_multibytes_char(self):
103
+ tokenizer = self.t5_base_tokenizer
104
+ src_text = "Unicode €."
105
+ encoded = tokenizer(src_text)
106
+ encoded_ids = [88, 113, 108, 102, 114, 103, 104, 35, 229, 133, 175, 49, 1]
107
+ self.assertEqual(encoded["input_ids"], encoded_ids)
108
+
109
+ # decoding
110
+ decoded = tokenizer.decode(encoded_ids)
111
+ self.assertEqual(decoded, "Unicode €.</s>")
112
+
113
+ encoded = tokenizer("e è é ê ë")
114
+ encoded_ids = [104, 35, 198, 171, 35, 198, 172, 35, 198, 173, 35, 198, 174, 1]
115
+ self.assertEqual(encoded["input_ids"], encoded_ids)
116
+ # decoding
117
+ decoded = tokenizer.decode(encoded_ids)
118
+ self.assertEqual(decoded, "e è é ê ë</s>")
119
+
120
+ # encode/decode, but with `encode` instead of `__call__`
121
+ self.assertEqual(tokenizer.decode(tokenizer.encode("e è é ê ë")), "e è é ê ë</s>")
122
+
123
+ def test_prepare_batch_integration(self):
124
+ tokenizer = self.t5_base_tokenizer
125
+ src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
126
+ expected_src_tokens = [68, 35, 111, 114, 113, 106, 35, 115, 100, 117, 100, 106, 117, 100, 115, 107, 35, 105, 114, 117, 35, 118, 120, 112, 112, 100, 117, 108, 125, 100, 119, 108, 114, 113, 49, 1, 0] # fmt: skip
127
+ batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK)
128
+ self.assertIsInstance(batch, BatchEncoding)
129
+
130
+ if FRAMEWORK != "jax":
131
+ result = list(batch.input_ids.numpy()[0])
132
+ else:
133
+ result = list(batch.input_ids.tolist()[0])
134
+
135
+ self.assertListEqual(expected_src_tokens, result)
136
+
137
+ self.assertEqual((2, 37), batch.input_ids.shape)
138
+ self.assertEqual((2, 37), batch.attention_mask.shape)
139
+
140
+ def test_empty_target_text(self):
141
+ tokenizer = self.t5_base_tokenizer
142
+ src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
143
+ batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK)
144
+ # check if input_ids are returned and no decoder_input_ids
145
+ self.assertIn("input_ids", batch)
146
+ self.assertIn("attention_mask", batch)
147
+ self.assertNotIn("decoder_input_ids", batch)
148
+ self.assertNotIn("decoder_attention_mask", batch)
149
+
150
+ def test_max_length_integration(self):
151
+ tokenizer = self.t5_base_tokenizer
152
+ tgt_text = [
153
+ "Summary of the text.",
154
+ "Another summary.",
155
+ ]
156
+ targets = tokenizer(
157
+ text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK
158
+ )
159
+ self.assertEqual(32, targets["input_ids"].shape[1])
160
+
161
+ def test_eos_in_input(self):
162
+ tokenizer = self.t5_base_tokenizer
163
+ src_text = ["A long paragraph for summarization. </s>"]
164
+ tgt_text = ["Summary of the text. </s>"]
165
+ expected_src_tokens = [68, 35, 111, 114, 113, 106, 35, 115, 100, 117, 100, 106, 117, 100, 115, 107, 35, 105, 114, 117, 35, 118, 120, 112, 112, 100, 117, 108, 125, 100, 119, 108, 114, 113, 49, 35, 1] # fmt: skip
166
+ expected_tgt_tokens = [86, 120, 112, 112, 100, 117, 124, 35, 114, 105, 35, 119, 107, 104, 35, 119, 104, 123, 119, 49, 35, 1] # fmt: skip
167
+
168
+ batch = tokenizer(src_text, text_target=tgt_text)
169
+
170
+ self.assertEqual(expected_src_tokens, batch["input_ids"][0])
171
+ self.assertEqual(expected_tgt_tokens, batch["labels"][0])
172
+
173
+ # cannot use default save_and_load_tokenizer test method because tokenizer has no vocab
174
+ def test_save_and_load_tokenizer(self):
175
+ # safety check on max_len default value so we are sure the test works
176
+ tokenizers = self.get_tokenizers()
177
+ for tokenizer in tokenizers:
178
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
179
+ self.assertNotEqual(tokenizer.model_max_length, 42)
180
+
181
+ # Now let's start the test
182
+ tokenizers = self.get_tokenizers()
183
+ for tokenizer in tokenizers:
184
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
185
+ # Isolate this from the other tests because we save additional tokens/etc
186
+ tmpdirname = tempfile.mkdtemp()
187
+
188
+ sample_text = " He is very happy, UNwant\u00e9d,running"
189
+ before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
190
+ tokenizer.save_pretrained(tmpdirname)
191
+
192
+ after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
193
+ after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
194
+ self.assertListEqual(before_tokens, after_tokens)
195
+
196
+ shutil.rmtree(tmpdirname)
197
+
198
+ tokenizers = self.get_tokenizers(model_max_length=42)
199
+ for tokenizer in tokenizers:
200
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
201
+ # Isolate this from the other tests because we save additional tokens/etc
202
+ tmpdirname = tempfile.mkdtemp()
203
+
204
+ sample_text = " He is very happy, UNwant\u00e9d,running"
205
+ tokenizer.add_tokens(["bim", "bambam"])
206
+ additional_special_tokens = tokenizer.additional_special_tokens
207
+ additional_special_tokens.append("new_additional_special_token")
208
+ tokenizer.add_special_tokens(
209
+ {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False
210
+ )
211
+ before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
212
+ tokenizer.save_pretrained(tmpdirname)
213
+
214
+ after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
215
+ after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
216
+ self.assertListEqual(before_tokens, after_tokens)
217
+ self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens)
218
+ self.assertEqual(after_tokenizer.model_max_length, 42)
219
+
220
+ tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43)
221
+ self.assertEqual(tokenizer.model_max_length, 43)
222
+
223
+ shutil.rmtree(tmpdirname)
224
+
225
+ # There is a conflict between the default value of extra_ids and adding a new special token through additional_special_tokens
226
+ # We need to add the extra_ids in the list of the arg additional_special_tokens
227
+ def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
228
+ tokenizer_list = []
229
+ if self.test_slow_tokenizer:
230
+ tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
231
+
232
+ if self.test_rust_tokenizer:
233
+ tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
234
+
235
+ for tokenizer_class, tokenizer_utils in tokenizer_list:
236
+ with tempfile.TemporaryDirectory() as tmp_dir:
237
+ tokenizer_utils.save_pretrained(tmp_dir)
238
+
239
+ with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
240
+ special_tokens_map = json.load(json_file)
241
+
242
+ with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
243
+ tokenizer_config = json.load(json_file)
244
+
245
+ added_tokens_extra_ids = [f"<extra_id_{i}>" for i in range(125)]
246
+
247
+ special_tokens_map["additional_special_tokens"] = added_tokens_extra_ids + [
248
+ "an_additional_special_token"
249
+ ]
250
+ tokenizer_config["additional_special_tokens"] = added_tokens_extra_ids + [
251
+ "an_additional_special_token"
252
+ ]
253
+
254
+ with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
255
+ json.dump(special_tokens_map, outfile)
256
+ with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
257
+ json.dump(tokenizer_config, outfile)
258
+
259
+ # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
260
+ # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
261
+ # "special_tokens_map.json" files
262
+ tokenizer_without_change_in_init = tokenizer_class.from_pretrained(
263
+ tmp_dir,
264
+ )
265
+ self.assertIn(
266
+ "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens
267
+ )
268
+ # self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab
269
+ self.assertEqual(
270
+ ["an_additional_special_token"],
271
+ tokenizer_without_change_in_init.convert_ids_to_tokens(
272
+ tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"])
273
+ ),
274
+ )
275
+
276
+ # Now we test that we can change the value of additional_special_tokens in the from_pretrained
277
+ new_added_tokens = added_tokens_extra_ids + [AddedToken("a_new_additional_special_token", lstrip=True)]
278
+ tokenizer = tokenizer_class.from_pretrained(
279
+ tmp_dir,
280
+ additional_special_tokens=new_added_tokens,
281
+ )
282
+
283
+ self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens)
284
+ self.assertEqual(
285
+ ["a_new_additional_special_token"],
286
+ tokenizer.convert_ids_to_tokens(
287
+ tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"])
288
+ ),
289
+ )
290
+
291
+ def test_decode_single_bytes(self):
292
+ tokenizer_list = []
293
+ if self.test_slow_tokenizer:
294
+ tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
295
+
296
+ if self.test_rust_tokenizer:
297
+ tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
298
+
299
+ for tokenizer_class, tokenizer_utils in tokenizer_list:
300
+ with tempfile.TemporaryDirectory() as tmp_dir:
301
+ tokenizer_utils.save_pretrained(tmp_dir)
302
+
303
+ tokenizer = tokenizer_class.from_pretrained(tmp_dir)
304
+
305
+ self.assertTrue(tokenizer.decode([255]) == "")
306
+
307
+ @unittest.skip(reason="ByT5Tokenizer does not have a vocabulary")
308
+ def test_get_vocab(self):
309
+ pass
310
+
311
+ @unittest.skip(reason="inputs cannot be pretokenized as ids depend on whole input string")
312
+ def test_pretokenized_inputs(self):
313
+ pass
314
+
315
+ @unittest.skip(reason="ByT5Tokenizer does not have a vocabulary")
316
+ def test_conversion_reversible(self):
317
+ pass
318
+
319
+ def test_convert_tokens_to_string_format(self):
320
+ # The default common tokenizer tests uses invalid tokens for ByT5 that can only accept one-character strings
321
+ # and special added tokens as tokens
322
+ tokenizers = self.get_tokenizers(fast=True, do_lower_case=True)
323
+ for tokenizer in tokenizers:
324
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
325
+ tokens = ["t", "h", "i", "s", " ", "i", "s", " ", "a", " ", "t", "e", "x", "t", "</s>"]
326
+ string = tokenizer.convert_tokens_to_string(tokens)
327
+
328
+ self.assertIsInstance(string, str)
329
+
330
+ # We need a different implementation of the test of the same name defined in TokenizerTesterMixin because this tokenizer
331
+ # doesn't have a vocab
332
+ def test_tokenizers_common_ids_setters(self):
333
+ tokenizers = self.get_tokenizers()
334
+ for tokenizer in tokenizers:
335
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
336
+ attributes_list = [
337
+ "bos_token",
338
+ "eos_token",
339
+ "unk_token",
340
+ "sep_token",
341
+ "pad_token",
342
+ "cls_token",
343
+ "mask_token",
344
+ ]
345
+
346
+ token_id_to_test_setters = 0
347
+ token_to_test_setters = tokenizer.convert_ids_to_tokens(
348
+ token_id_to_test_setters, skip_special_tokens=False
349
+ )
350
+
351
+ for attr in attributes_list:
352
+ setattr(tokenizer, attr + "_id", None)
353
+ self.assertEqual(getattr(tokenizer, attr), None)
354
+ self.assertEqual(getattr(tokenizer, attr + "_id"), None)
355
+
356
+ setattr(tokenizer, attr + "_id", token_id_to_test_setters)
357
+ self.assertEqual(getattr(tokenizer, attr), token_to_test_setters)
358
+ self.assertEqual(getattr(tokenizer, attr + "_id"), token_id_to_test_setters)
359
+
360
+ setattr(tokenizer, "additional_special_tokens_ids", [])
361
+ self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [])
362
+ self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [])
363
+
364
+ setattr(tokenizer, "additional_special_tokens_ids", [token_id_to_test_setters])
365
+ self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [token_to_test_setters])
366
+ self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [token_id_to_test_setters])
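
The hard-coded ids in the integration tests above (test_multibytes_char, test_prepare_batch_integration, test_eos_in_input) follow ByT5's byte-level scheme: each UTF-8 byte b maps to id b + 3, because ids 0, 1 and 2 are reserved for <pad>, </s> and <unk>. A minimal sketch of that mapping (ignoring the extra sentinel tokens a real ByT5Tokenizer also reserves):

def byt5_like_encode(text: str, add_eos: bool = True) -> list[int]:
    # Each UTF-8 byte shifts by 3 to make room for <pad>=0, </s>=1, <unk>=2.
    ids = [b + 3 for b in text.encode("utf-8")]
    return ids + [1] if add_eos else ids

# Reproduces the expected encoding of "Unicode €." used in test_multibytes_char.
assert byt5_like_encode("Unicode €.") == [88, 113, 108, 102, 114, 103, 104, 35, 229, 133, 175, 49, 1]
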
docs/transformers/tests/models/camembert/__init__.py ADDED
File without changes
docs/transformers/tests/models/camembert/test_modeling_tf_camembert.py ADDED
@@ -0,0 +1,55 @@
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import unittest
18
+
19
+ from transformers import is_tf_available
20
+ from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
21
+
22
+
23
+ if is_tf_available():
24
+ import numpy as np
25
+ import tensorflow as tf
26
+
27
+ from transformers import TFCamembertModel
28
+
29
+
30
+ @require_tf
31
+ @require_sentencepiece
32
+ @require_tokenizers
33
+ class TFCamembertModelIntegrationTest(unittest.TestCase):
34
+ @slow
35
+ def test_output_embeds_base_model(self):
36
+ model = TFCamembertModel.from_pretrained("jplu/tf-camembert-base")
37
+
38
+ input_ids = tf.convert_to_tensor(
39
+ [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]],
40
+ dtype=tf.int32,
41
+ ) # J'aime le camembert !"
42
+
43
+ output = model(input_ids)["last_hidden_state"]
44
+ expected_shape = tf.TensorShape((1, 10, 768))
45
+ self.assertEqual(output.shape, expected_shape)
46
+ # compare the actual values for a slice.
47
+ expected_slice = tf.convert_to_tensor(
48
+ [[[-0.0254, 0.0235, 0.1027], [0.0606, -0.1811, -0.0418], [-0.1561, -0.1127, 0.2687]]],
49
+ dtype=tf.float32,
50
+ )
51
+ # camembert = torch.hub.load('pytorch/fairseq', 'camembert.v0')
52
+ # camembert.eval()
53
+ # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach()
54
+
55
+ self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4))
docs/transformers/tests/models/camembert/test_tokenization_camembert.py ADDED
@@ -0,0 +1,220 @@
1
+ # Copyright 2018 HuggingFace Inc. team.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import tempfile
16
+ import unittest
17
+ from tempfile import TemporaryDirectory
18
+
19
+ from transformers import AddedToken, CamembertTokenizer, CamembertTokenizerFast
20
+ from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
21
+ from transformers.utils import is_torch_available
22
+
23
+ from ...test_tokenization_common import TokenizerTesterMixin
24
+
25
+
26
+ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
27
+ SAMPLE_BPE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe.model")
28
+
29
+ FRAMEWORK = "pt" if is_torch_available() else "tf"
30
+
31
+
32
+ @require_sentencepiece
33
+ @require_tokenizers
34
+ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
35
+ from_pretrained_id = "almanach/camembert-base"
36
+ tokenizer_class = CamembertTokenizer
37
+ rust_tokenizer_class = CamembertTokenizerFast
38
+ test_rust_tokenizer = True
39
+ test_sentencepiece = True
40
+
41
+ @classmethod
42
+ def setUpClass(cls):
43
+ super().setUpClass()
44
+
45
+ # We have a SentencePiece fixture for testing
46
+ tokenizer = CamembertTokenizer(SAMPLE_VOCAB)
47
+ tokenizer.save_pretrained(cls.tmpdirname)
48
+
49
+ @unittest.skip(
50
+ "Token maps are not equal because someone set the probability of ('<unk>NOTUSED', -100), so it's never encoded for fast"
51
+ )
52
+ def test_special_tokens_map_equal(self):
53
+ return
54
+
55
+ def test_convert_token_and_id(self):
56
+ """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
57
+ token = "<pad>"
58
+ token_id = 1 # 1 is the offset id, but in the spm vocab it's 3
59
+
60
+ self.assertEqual(self.get_tokenizer().convert_tokens_to_ids(token), token_id)
61
+ self.assertEqual(self.get_tokenizer().convert_ids_to_tokens(token_id), token)
62
+
63
+ def test_get_vocab(self):
64
+ vocab_keys = list(self.get_tokenizer().get_vocab().keys())
65
+
66
+ self.assertEqual(vocab_keys[0], "<s>NOTUSED")
67
+ self.assertEqual(vocab_keys[1], "<pad>")
68
+ self.assertEqual(vocab_keys[-1], "<mask>")
69
+ self.assertEqual(len(vocab_keys), 1_005)
70
+
71
+ def test_vocab_size(self):
72
+ self.assertEqual(self.get_tokenizer().vocab_size, 1_000)
73
+
74
+ def test_rust_and_python_bpe_tokenizers(self):
75
+ tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB)
76
+ with TemporaryDirectory() as tmpdirname:
77
+ tokenizer.save_pretrained(tmpdirname)
78
+ rust_tokenizer = CamembertTokenizerFast.from_pretrained(tmpdirname)
79
+
80
+ sequence = "I was born in 92000, and this is falsé."
81
+
82
+ ids = tokenizer.encode(sequence)
83
+ rust_ids = rust_tokenizer.encode(sequence)
84
+ self.assertListEqual(ids, rust_ids)
85
+
86
+ ids = tokenizer.encode(sequence, add_special_tokens=False)
87
+ rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
88
+ self.assertListEqual(ids, rust_ids)
89
+
90
+ # <unk> tokens are not the same for `rust` than for `slow`.
91
+ # Because spm gives back raw token instead of `unk` in EncodeAsPieces
92
+ # tokens = tokenizer.tokenize(sequence)
93
+ tokens = tokenizer.convert_ids_to_tokens(ids)
94
+ rust_tokens = rust_tokenizer.tokenize(sequence)
95
+ self.assertListEqual(tokens, rust_tokens)
96
+
97
+ def test_rust_and_python_full_tokenizers(self):
98
+ if not self.test_rust_tokenizer:
99
+ self.skipTest(reason="test_rust_tokenizer is set to False")
100
+
101
+ tokenizer = self.get_tokenizer()
102
+ rust_tokenizer = self.get_rust_tokenizer()
103
+
104
+ sequence = "I was born in 92000, and this is falsé."
105
+
106
+ tokens = tokenizer.tokenize(sequence)
107
+ rust_tokens = rust_tokenizer.tokenize(sequence)
108
+ self.assertListEqual(tokens, rust_tokens)
109
+
110
+ ids = tokenizer.encode(sequence, add_special_tokens=False)
111
+ rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
112
+ self.assertListEqual(ids, rust_ids)
113
+
114
+ rust_tokenizer = self.get_rust_tokenizer()
115
+ ids = tokenizer.encode(sequence)
116
+ rust_ids = rust_tokenizer.encode(sequence)
117
+ self.assertListEqual(ids, rust_ids)
118
+
119
+ @slow
120
+ def test_tokenizer_integration(self):
121
+ expected_encoding = {'input_ids': [[5, 54, 7196, 297, 30, 23, 776, 18, 11, 3215, 3705, 8252, 22, 3164, 1181, 2116, 29, 16, 813, 25, 791, 3314, 20, 3446, 38, 27575, 120, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [5, 468, 17, 11, 9088, 20, 1517, 8, 22804, 18818, 10, 38, 629, 607, 607, 142, 19, 7196, 867, 56, 10326, 24, 2267, 20, 416, 5072, 15612, 233, 734, 7, 2399, 27, 16, 3015, 1649, 7, 24, 20, 4338, 2399, 27, 13, 3400, 14, 13, 6189, 8, 930, 9, 6]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip
122
+
123
+ # camembert is a french model. So we also use french texts.
124
+ sequences = [
125
+ "Le transformeur est un modèle d'apprentissage profond introduit en 2017, "
126
+ "utilisé principalement dans le domaine du traitement automatique des langues (TAL).",
127
+ "À l'instar des réseaux de neurones récurrents (RNN), les transformeurs sont conçus "
128
+ "pour gérer des données séquentielles, telles que le langage naturel, pour des tâches "
129
+ "telles que la traduction et la synthèse de texte.",
130
+ ]
131
+
132
+ self.tokenizer_integration_test_util(
133
+ expected_encoding=expected_encoding,
134
+ model_name="almanach/camembert-base",
135
+ revision="3a0641d9a1aeb7e848a74299e7e4c4bca216b4cf",
136
+ sequences=sequences,
137
+ )
138
+
139
+ # Overwritten because we have to load with `from_slow=True` (the online pretrained tokenizer is wrong, its tokenizer.json has a hole)
140
+ def test_added_tokens_serialization(self):
141
+ self.maxDiff = None
142
+
143
+ # Utility to test the added vocab
144
+ def _test_added_vocab_and_eos(expected, tokenizer_class, expected_eos, temp_dir):
145
+ tokenizer = tokenizer_class.from_pretrained(temp_dir)
146
+ self.assertTrue(str(expected_eos) not in tokenizer.additional_special_tokens)
147
+ self.assertIn(new_eos, tokenizer.added_tokens_decoder.values())
148
+ self.assertEqual(tokenizer.added_tokens_decoder[tokenizer.eos_token_id], new_eos)
149
+ self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items()))
150
+ return tokenizer
151
+
152
+ new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True)
153
+ for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
154
+ with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
155
+ # Load a slow tokenizer from the hub, init with the new token for fast to also include it
156
+ tokenizer = self.get_tokenizer(pretrained_name, eos_token=new_eos)
157
+ EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
158
+ with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
159
+ self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
160
+ self.assertIn(new_eos, list(tokenizer.added_tokens_decoder.values()))
161
+
162
+ with tempfile.TemporaryDirectory() as tmp_dir_2:
163
+ tokenizer.save_pretrained(tmp_dir_2)
164
+ with self.subTest(
165
+ "Hub -> Slow -> Slow: Test saving this slow tokenizer and reloading it in the fast class"
166
+ ):
167
+ _test_added_vocab_and_eos(
168
+ EXPECTED_ADDED_TOKENS_DECODER, self.tokenizer_class, new_eos, tmp_dir_2
169
+ )
170
+
171
+ if self.rust_tokenizer_class is not None:
172
+ with self.subTest(
173
+ "Hub -> Slow -> Fast: Test saving this slow tokenizer and reloading it in the fast class"
174
+ ):
175
+ tokenizer_fast = _test_added_vocab_and_eos(
176
+ EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_2
177
+ )
178
+ with tempfile.TemporaryDirectory() as tmp_dir_3:
179
+ tokenizer_fast.save_pretrained(tmp_dir_3)
180
+ with self.subTest(
181
+ "Hub -> Slow -> Fast -> Fast: Test saving this fast tokenizer and reloading it in the fast class"
182
+ ):
183
+ _test_added_vocab_and_eos(
184
+ EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_3
185
+ )
186
+
187
+ with self.subTest(
188
+ "Hub -> Slow -> Fast -> Slow: Test saving this slow tokenizer and reloading it in the slow class"
189
+ ):
190
+ _test_added_vocab_and_eos(
191
+ EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_3
192
+ )
193
+
194
+ with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
195
+ if self.rust_tokenizer_class is not None:
196
+ tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos, from_slow=True)
197
+ self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
198
+ self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
199
+ # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
200
+ with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
201
+ with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
202
+ self.assertTrue(
203
+ all(
204
+ item in tokenizer.added_tokens_decoder.items()
205
+ for item in EXPECTED_ADDED_TOKENS_DECODER.items()
206
+ )
207
+ )
208
+
209
+ EXPECTED_ADDED_TOKENS_DECODER = tokenizer_fast.added_tokens_decoder
210
+ with tempfile.TemporaryDirectory() as tmp_dir_4:
211
+ tokenizer_fast.save_pretrained(tmp_dir_4)
212
+ with self.subTest("Hub -> Fast -> Fast: saving Fast1 locally and loading"):
213
+ _test_added_vocab_and_eos(
214
+ EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_4
215
+ )
216
+
217
+ with self.subTest("Hub -> Fast -> Slow: saving Fast1 locally and loading"):
218
+ _test_added_vocab_and_eos(
219
+ EXPECTED_ADDED_TOKENS_DECODER, self.tokenizer_class, new_eos, tmp_dir_4
220
+ )
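
Stripped of its subtests, test_added_tokens_serialization above checks one round trip: load with eos_token overridden by an AddedToken, save, reload, and verify the override survives in added_tokens_decoder. A condensed sketch of that flow for the slow tokenizer only (it downloads almanach/camembert-base, the checkpoint used by the test class):

import tempfile
from transformers import AddedToken, CamembertTokenizer

new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True)
tok = CamembertTokenizer.from_pretrained("almanach/camembert-base", eos_token=new_eos)
with tempfile.TemporaryDirectory() as tmp_dir:
    tok.save_pretrained(tmp_dir)
    reloaded = CamembertTokenizer.from_pretrained(tmp_dir)
    # The overridden EOS token should come back as an entry of added_tokens_decoder.
    assert reloaded.added_tokens_decoder[reloaded.eos_token_id] == new_eos
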
docs/transformers/tests/models/canine/__init__.py ADDED
File without changes
docs/transformers/tests/models/canine/test_modeling_canine.py ADDED
@@ -0,0 +1,571 @@
1
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Testing suite for the PyTorch CANINE model."""
15
+
16
+ import unittest
17
+
18
+ from transformers import CanineConfig, is_torch_available
19
+ from transformers.testing_utils import require_torch, slow, torch_device
20
+
21
+ from ...test_configuration_common import ConfigTester
22
+ from ...test_modeling_common import ModelTesterMixin, _config_zero_init, global_rng, ids_tensor, random_attention_mask
23
+ from ...test_pipeline_mixin import PipelineTesterMixin
24
+
25
+
26
+ if is_torch_available():
27
+ import torch
28
+
29
+ from transformers import (
30
+ CanineForMultipleChoice,
31
+ CanineForQuestionAnswering,
32
+ CanineForSequenceClassification,
33
+ CanineForTokenClassification,
34
+ CanineModel,
35
+ )
36
+
37
+
38
+ class CanineModelTester:
39
+ def __init__(
40
+ self,
41
+ parent,
42
+ batch_size=13,
43
+ seq_length=7,
44
+ is_training=True,
45
+ use_input_mask=True,
46
+ use_token_type_ids=True,
47
+ use_labels=True,
48
+ # let's use a vocab size that's way bigger than BERT's one
49
+ # NOTE: this is not a model parameter, just an input
50
+ vocab_size=100000,
51
+ hidden_size=32,
52
+ num_hidden_layers=2,
53
+ num_attention_heads=4,
54
+ intermediate_size=37,
55
+ hidden_act="gelu",
56
+ hidden_dropout_prob=0.1,
57
+ attention_probs_dropout_prob=0.1,
58
+ max_position_embeddings=512,
59
+ type_vocab_size=16,
60
+ type_sequence_label_size=2,
61
+ initializer_range=0.02,
62
+ num_labels=3,
63
+ num_choices=4,
64
+ num_hash_buckets=16,
65
+ scope=None,
66
+ ):
67
+ self.parent = parent
68
+ self.batch_size = batch_size
69
+ self.seq_length = seq_length
70
+ self.is_training = is_training
71
+ self.use_input_mask = use_input_mask
72
+ self.use_token_type_ids = use_token_type_ids
73
+ self.use_labels = use_labels
74
+ self.vocab_size = vocab_size
75
+ self.hidden_size = hidden_size
76
+ self.num_hidden_layers = num_hidden_layers
77
+ self.num_attention_heads = num_attention_heads
78
+ self.intermediate_size = intermediate_size
79
+ self.hidden_act = hidden_act
80
+ self.hidden_dropout_prob = hidden_dropout_prob
81
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
82
+ self.max_position_embeddings = max_position_embeddings
83
+ self.type_vocab_size = type_vocab_size
84
+ self.type_sequence_label_size = type_sequence_label_size
85
+ self.initializer_range = initializer_range
86
+ self.num_labels = num_labels
87
+ self.num_choices = num_choices
88
+ self.num_hash_buckets = num_hash_buckets
89
+ self.scope = scope
90
+
91
+ def prepare_config_and_inputs(self):
92
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
93
+
94
+ input_mask = None
95
+ if self.use_input_mask:
96
+ input_mask = random_attention_mask([self.batch_size, self.seq_length])
97
+
98
+ token_type_ids = None
99
+ if self.use_token_type_ids:
100
+ token_type_ids = ids_tensor(input_ids.shape, self.type_vocab_size)
101
+
102
+ sequence_labels = None
103
+ token_labels = None
104
+ choice_labels = None
105
+ if self.use_labels:
106
+ sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
107
+ token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
108
+ choice_labels = ids_tensor([self.batch_size], self.num_choices)
109
+
110
+ config = self.get_config()
111
+
112
+ return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
113
+
114
+ def get_config(self):
115
+ return CanineConfig(
116
+ hidden_size=self.hidden_size,
117
+ num_hidden_layers=self.num_hidden_layers,
118
+ num_attention_heads=self.num_attention_heads,
119
+ intermediate_size=self.intermediate_size,
120
+ hidden_act=self.hidden_act,
121
+ hidden_dropout_prob=self.hidden_dropout_prob,
122
+ attention_probs_dropout_prob=self.attention_probs_dropout_prob,
123
+ max_position_embeddings=self.max_position_embeddings,
124
+ type_vocab_size=self.type_vocab_size,
125
+ is_decoder=False,
126
+ initializer_range=self.initializer_range,
127
+ num_hash_buckets=self.num_hash_buckets,
128
+ )
129
+
130
+ def create_and_check_model(
131
+ self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
132
+ ):
133
+ model = CanineModel(config=config)
134
+ model.to(torch_device)
135
+ model.eval()
136
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
137
+ result = model(input_ids, token_type_ids=token_type_ids)
138
+ result = model(input_ids)
139
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
140
+
141
+ def create_and_check_for_question_answering(
142
+ self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
143
+ ):
144
+ model = CanineForQuestionAnswering(config=config)
145
+ model.to(torch_device)
146
+ model.eval()
147
+ result = model(
148
+ input_ids,
149
+ attention_mask=input_mask,
150
+ token_type_ids=token_type_ids,
151
+ start_positions=sequence_labels,
152
+ end_positions=sequence_labels,
153
+ )
154
+ self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
155
+ self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
156
+
157
+ def create_and_check_for_sequence_classification(
158
+ self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
159
+ ):
160
+ config.num_labels = self.num_labels
161
+ model = CanineForSequenceClassification(config)
162
+ model.to(torch_device)
163
+ model.eval()
164
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
165
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
166
+
167
+ def create_and_check_for_token_classification(
168
+ self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
169
+ ):
170
+ config.num_labels = self.num_labels
171
+ model = CanineForTokenClassification(config=config)
172
+ model.to(torch_device)
173
+ model.eval()
174
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
175
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
176
+
177
+ def create_and_check_for_multiple_choice(
178
+ self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
179
+ ):
180
+ config.num_choices = self.num_choices
181
+ model = CanineForMultipleChoice(config=config)
182
+ model.to(torch_device)
183
+ model.eval()
184
+ multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
185
+ multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
186
+ multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
187
+ result = model(
188
+ multiple_choice_inputs_ids,
189
+ attention_mask=multiple_choice_input_mask,
190
+ token_type_ids=multiple_choice_token_type_ids,
191
+ labels=choice_labels,
192
+ )
193
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
194
+
195
+ def prepare_config_and_inputs_for_common(self):
196
+ config_and_inputs = self.prepare_config_and_inputs()
197
+ (
198
+ config,
199
+ input_ids,
200
+ token_type_ids,
201
+ input_mask,
202
+ sequence_labels,
203
+ token_labels,
204
+ choice_labels,
205
+ ) = config_and_inputs
206
+ inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
207
+ return config, inputs_dict
208
+
209
+
210
+ @require_torch
211
+ class CanineModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
212
+ all_model_classes = (
213
+ (
214
+ CanineModel,
215
+ CanineForMultipleChoice,
216
+ CanineForQuestionAnswering,
217
+ CanineForSequenceClassification,
218
+ CanineForTokenClassification,
219
+ )
220
+ if is_torch_available()
221
+ else ()
222
+ )
223
+ pipeline_model_mapping = (
224
+ {
225
+ "feature-extraction": CanineModel,
226
+ "question-answering": CanineForQuestionAnswering,
227
+ "text-classification": CanineForSequenceClassification,
228
+ "token-classification": CanineForTokenClassification,
229
+ "zero-shot": CanineForSequenceClassification,
230
+ }
231
+ if is_torch_available()
232
+ else {}
233
+ )
234
+
235
+ test_mismatched_shapes = False
236
+ test_resize_embeddings = False
237
+ test_pruning = False
238
+
239
+ def setUp(self):
240
+ self.model_tester = CanineModelTester(self)
241
+ # we set has_text_modality to False as the config has no vocab_size attribute
242
+ self.config_tester = ConfigTester(self, config_class=CanineConfig, has_text_modality=False, hidden_size=37)
243
+
244
+ def test_config(self):
245
+ self.config_tester.run_common_tests()
246
+
247
+ def test_model(self):
248
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
249
+ self.model_tester.create_and_check_model(*config_and_inputs)
250
+
251
+ def test_for_multiple_choice(self):
252
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
253
+ self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
254
+
255
+ def test_for_question_answering(self):
256
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
257
+ self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
258
+
259
+ def test_for_sequence_classification(self):
260
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
261
+ self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
262
+
263
+ def test_for_token_classification(self):
264
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
265
+ self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
266
+
267
+ def test_hidden_states_output(self):
268
+ def check_hidden_states_output(inputs_dict, config, model_class):
269
+ model = model_class(config)
270
+ model.to(torch_device)
271
+ model.eval()
272
+
273
+ with torch.no_grad():
274
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class))
275
+
276
+ hidden_states = outputs.hidden_states
277
+ # expected_num_layers equals num_hidden_layers of the deep encoder + 1, + 2 for the first shallow encoder, + 2
278
+ # for the final shallow encoder
279
+ expected_num_layers = self.model_tester.num_hidden_layers + 1 + 2 + 2
280
+ self.assertEqual(len(hidden_states), expected_num_layers)
281
+
282
+ seq_length = self.model_tester.seq_length
283
+ for i in range(expected_num_layers):
284
+ if (i < 2) or ((expected_num_layers - i) < 3):
285
+ # the expected length of the hidden_states of the first and final shallow encoders
286
+ # is equal to the seq_length
287
+ self.assertListEqual(
288
+ list(hidden_states[i].shape[-2:]),
289
+ [seq_length, self.model_tester.hidden_size],
290
+ )
291
+ else:
292
+ # the expected length of the hidden_states of the deep encoder need to be updated
293
+ # for CANINE since the seq length is downsampled
294
+ self.assertListEqual(
295
+ list(hidden_states[i].shape[-2:]),
296
+ [seq_length // config.downsampling_rate, self.model_tester.hidden_size],
297
+ )
298
+
299
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
300
+
301
+ for model_class in self.all_model_classes:
302
+ inputs_dict["output_hidden_states"] = True
303
+ check_hidden_states_output(inputs_dict, config, model_class)
304
+
305
+ # check that output_hidden_states also work using config
306
+ del inputs_dict["output_hidden_states"]
307
+ config.output_hidden_states = True
308
+
309
+ check_hidden_states_output(inputs_dict, config, model_class)
310
+
311
+ def test_attention_outputs(self):
312
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
313
+ config.return_dict = True
314
+
315
+ seq_len = getattr(self.model_tester, "seq_length", None)
316
+
317
+ for model_class in self.all_model_classes:
318
+ inputs_dict["output_attentions"] = True
319
+ inputs_dict["output_hidden_states"] = False
320
+ config.return_dict = True
321
+ model = model_class(config)
322
+ model.to(torch_device)
323
+ model.eval()
324
+ with torch.no_grad():
325
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class))
326
+ attentions = outputs.attentions
327
+ # we add + 2 due to the 2 shallow encoders
328
+ self.assertEqual(len(attentions), self.model_tester.num_hidden_layers + 2)
329
+
330
+ # check that output_attentions also work using config
331
+ del inputs_dict["output_attentions"]
332
+ config.output_attentions = True
333
+ model = model_class(config)
334
+ model.to(torch_device)
335
+ model.eval()
336
+ with torch.no_grad():
337
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class))
338
+ attentions = outputs.attentions
339
+ # we add + 2 due to the 2 shallow encoders
340
+ self.assertEqual(len(attentions), self.model_tester.num_hidden_layers + 2)
341
+
342
+ self.assertListEqual(
343
+ list(attentions[0].shape[-3:]),
344
+ [self.model_tester.num_attention_heads, seq_len, seq_len],
345
+ )
346
+ out_len = len(outputs)
347
+
348
+ # Check attention is always last and order is fine
349
+ inputs_dict["output_attentions"] = True
350
+ inputs_dict["output_hidden_states"] = True
351
+ model = model_class(config)
352
+ model.to(torch_device)
353
+ model.eval()
354
+ with torch.no_grad():
355
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class))
356
+
357
+ if hasattr(self.model_tester, "num_hidden_states_types"):
358
+ added_hidden_states = self.model_tester.num_hidden_states_types
359
+ else:
360
+ added_hidden_states = 1
361
+ self.assertEqual(out_len + added_hidden_states, len(outputs))
362
+
363
+ self_attentions = outputs.attentions
364
+
365
+ self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers + 2)
366
+ self.assertListEqual(
367
+ list(self_attentions[0].shape[-3:]),
368
+ [self.model_tester.num_attention_heads, seq_len, seq_len],
369
+ )
370
+
371
+ def test_model_outputs_equivalence(self):
372
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
373
+
374
+ def set_nan_tensor_to_zero(t):
375
+ t[t != t] = 0
376
+ return t
377
+
378
+ def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
379
+ with torch.no_grad():
380
+ tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs)
381
+ dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple()
382
+
383
+ def recursive_check(tuple_object, dict_object):
384
+ if isinstance(tuple_object, (list, tuple)):
385
+ for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
386
+ recursive_check(tuple_iterable_value, dict_iterable_value)
387
+ elif tuple_object is None:
388
+ return
389
+ else:
390
+ self.assertTrue(
391
+ torch.allclose(
392
+ set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
393
+ ),
394
+ msg=(
395
+ "Tuple and dict output are not equal. Difference:"
396
+ f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
397
+ f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has"
398
+ f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}."
399
+ ),
400
+ )
401
+
402
+ recursive_check(tuple_output, dict_output)
403
+
404
+ for model_class in self.all_model_classes:
405
+ print(model_class)
406
+ model = model_class(config)
407
+ model.to(torch_device)
408
+ model.eval()
409
+
410
+ tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
411
+ dict_inputs = self._prepare_for_class(inputs_dict, model_class)
412
+ check_equivalence(model, tuple_inputs, dict_inputs)
413
+
414
+ tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
415
+ dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
416
+ check_equivalence(model, tuple_inputs, dict_inputs)
417
+
418
+ tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
419
+ dict_inputs = self._prepare_for_class(inputs_dict, model_class)
420
+ check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
421
+
422
+ tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
423
+ dict_inputs = self._prepare_for_class(inputs_dict, model_class)
424
+ check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
425
+
426
+ tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
427
+ dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
428
+ check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True})
429
+
430
+ tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
431
+ dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
432
+ check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True})
433
+
434
+ tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
435
+ dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
436
+ check_equivalence(
437
+ model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True}
438
+ )
439
+
440
+ def test_headmasking(self):
441
+ if not self.test_head_masking:
442
+ self.skipTest(reason="test_head_masking is set to False")
443
+
444
+ global_rng.seed(42)
445
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
446
+ global_rng.seed()
447
+
448
+ inputs_dict["output_attentions"] = True
449
+ config.output_hidden_states = True
450
+ configs_no_init = _config_zero_init(config) # To be sure we have no Nan
451
+ for model_class in self.all_model_classes:
452
+ model = model_class(config=configs_no_init)
453
+ model.to(torch_device)
454
+ model.eval()
455
+
456
+ # Prepare head_mask
457
+ # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
458
+ head_mask = torch.ones(
459
+ self.model_tester.num_hidden_layers,
460
+ self.model_tester.num_attention_heads,
461
+ device=torch_device,
462
+ )
463
+ head_mask[0, 0] = 0
464
+ head_mask[-1, :-1] = 0
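+ # the mask above zeroes out the first head of the first layer and every head except the last one in the last layer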
465
+ head_mask.requires_grad_(requires_grad=True)
466
+ inputs = self._prepare_for_class(inputs_dict, model_class).copy()
467
+ inputs["head_mask"] = head_mask
468
+
469
+ outputs = model(**inputs, return_dict=True)
470
+
471
+ # Test that we can get a gradient back for importance score computation
472
+ output = sum(t.sum() for t in outputs[0])
473
+ output = output.sum()
474
+ output.backward()
475
+ multihead_outputs = head_mask.grad
476
+
477
+ self.assertIsNotNone(multihead_outputs)
478
+ self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
479
+
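+ # with the head mask above, attention weights of masked heads should sum to ~0 while unmasked heads stay non-zero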
480
+ def check_attentions_validity(attentions):
481
+ # Remove Nan
482
+ for t in attentions:
483
+ self.assertLess(
484
+ torch.sum(torch.isnan(t)), t.numel() / 4
485
+ ) # Check we don't have more than 25% nans (arbitrary)
486
+ attentions = [
487
+ t.masked_fill(torch.isnan(t), 0.0) for t in attentions
488
+ ] # remove them (the test is less complete)
489
+
490
+ self.assertAlmostEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
491
+ self.assertNotEqual(attentions[1][..., -1, :, :].flatten().sum().item(), 0.0)
492
+ self.assertAlmostEqual(attentions[-2][..., -2, :, :].flatten().sum().item(), 0.0)
493
+ self.assertNotEqual(attentions[-2][..., -1, :, :].flatten().sum().item(), 0.0)
494
+
495
+ check_attentions_validity(outputs.attentions)
496
+
497
+ @unittest.skip(reason="CANINE does not have a get_input_embeddings() method.")
498
+ def test_inputs_embeds(self):
499
+ # CANINE does not use inputs_embeds
500
+ pass
501
+
502
+ @unittest.skip(reason="Canine Tower does not use inputs_embeds")
503
+ def test_inputs_embeds_matches_input_ids(self):
504
+ pass
505
+
506
+ @unittest.skip(reason="CANINE does not have a get_input_embeddings() method.")
507
+ def test_model_get_set_embeddings(self):
508
+ pass
509
+
510
+ @unittest.skip(
511
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
512
+ )
513
+ def test_training_gradient_checkpointing(self):
514
+ pass
515
+
516
+ @unittest.skip(
517
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
518
+ )
519
+ def test_training_gradient_checkpointing_use_reentrant(self):
520
+ pass
521
+
522
+ @unittest.skip(
523
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
524
+ )
525
+ def test_training_gradient_checkpointing_use_reentrant_false(self):
526
+ pass
527
+
528
+ @slow
529
+ def test_model_from_pretrained(self):
530
+ model_name = "google/canine-s"
531
+ model = CanineModel.from_pretrained(model_name)
532
+ self.assertIsNotNone(model)
533
+
534
+
535
+ @require_torch
536
+ class CanineModelIntegrationTest(unittest.TestCase):
537
+ @slow
538
+ def test_inference_no_head(self):
539
+ model = CanineModel.from_pretrained("google/canine-s")
540
+ # this one corresponds to the first example of the TydiQA dev set (in Swahili)
541
+ # fmt: off
542
+ input_ids = [57344, 57349, 85, 107, 117, 98, 119, 97, 32, 119, 97, 32, 82, 105, 106, 105, 108, 105, 32, 75, 97, 110, 116, 111, 114, 105, 32, 110, 105, 32, 107, 105, 97, 115, 105, 32, 103, 97, 110, 105, 63, 57345, 57350, 32, 82, 105, 106, 105, 108, 105, 32, 75, 97, 110, 116, 111, 114, 105, 32, 44, 32, 82, 105, 106, 105, 108, 105, 32, 75, 97, 110, 116, 97, 114, 117, 115, 105, 32, 97, 117, 32, 105, 110, 103, 46, 32, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 40, 112, 105, 97, 58, 32, 84, 111, 108, 105, 109, 97, 110, 32, 97, 117, 32, 82, 105, 103, 105, 108, 32, 75, 101, 110, 116, 97, 117, 114, 117, 115, 41, 32, 110, 105, 32, 110, 121, 111, 116, 97, 32, 105, 110, 97, 121, 111, 110, 103, 39, 97, 97, 32, 115, 97, 110, 97, 32, 107, 97, 116, 105, 107, 97, 32, 97, 110, 103, 97, 32, 121, 97, 32, 107, 117, 115, 105, 110, 105, 32, 107, 119, 101, 110, 121, 101, 32, 107, 117, 110, 100, 105, 110, 121, 111, 116, 97, 32, 121, 97, 32, 75, 97, 110, 116, 97, 114, 117, 115, 105, 32, 40, 112, 105, 97, 58, 32, 105, 110, 103, 46, 32, 67, 101, 110, 116, 97, 117, 114, 117, 115, 41, 46, 32, 78, 105, 32, 110, 121, 111, 116, 97, 32, 121, 97, 32, 107, 117, 110, 103, 97, 97, 32, 115, 97, 110, 97, 32, 121, 97, 32, 110, 110, 101, 32, 97, 110, 103, 97, 110, 105, 32, 108, 97, 107, 105, 110, 105, 32, 104, 97, 105, 111, 110, 101, 107, 97, 110, 105, 32, 107, 119, 101, 110, 121, 101, 32, 110, 117, 115, 117, 100, 117, 110, 105, 97, 32, 121, 97, 32, 107, 97, 115, 107, 97, 122, 105, 110, 105, 46, 32, 57351, 32, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 110, 105, 32, 110, 121, 111, 116, 97, 32, 121, 97, 32, 112, 101, 107, 101, 101, 32, 107, 119, 97, 32, 115, 97, 98, 97, 98, 117, 32, 110, 105, 32, 110, 121, 111, 116, 97, 32, 121, 101, 116, 117, 32, 106, 105, 114, 97, 110, 105, 32, 107, 97, 116, 105, 107, 97, 32, 97, 110, 103, 97, 32, 105, 110, 97, 32, 117, 109, 98, 97, 108, 105, 32, 119, 97, 32, 109, 105, 97, 107, 97, 32, 121, 97, 32, 110, 117, 114, 117, 32, 52, 46, 50, 46, 32, 73, 110, 97, 111, 110, 101, 107, 97, 110, 97, 32, 97, 110, 103, 97, 110, 105, 32, 107, 97, 114, 105, 98, 117, 32, 110, 97, 32, 107, 117, 110, 100, 105, 110, 121, 111, 116, 97, 32, 121, 97, 32, 83, 97, 108, 105, 98, 117, 32, 40, 67, 114, 117, 120, 41, 46, 32, 57352, 32, 82, 105, 106, 105, 108, 105, 32, 75, 97, 110, 116, 97, 114, 117, 115, 105, 32, 40, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 41, 32, 105, 110, 97, 111, 110, 101, 107, 97, 110, 97, 32, 107, 97, 109, 97, 32, 110, 121, 111, 116, 97, 32, 109, 111, 106, 97, 32, 108, 97, 107, 105, 110, 105, 32, 107, 119, 97, 32, 100, 97, 114, 117, 98, 105, 110, 105, 32, 107, 117, 98, 119, 97, 32, 105, 110, 97, 111, 110, 101, 107, 97, 110, 97, 32, 107, 117, 119, 97, 32, 109, 102, 117, 109, 111, 32, 119, 97, 32, 110, 121, 111, 116, 97, 32, 116, 97, 116, 117, 32, 122, 105, 110, 97, 122, 111, 107, 97, 97, 32, 107, 97, 114, 105, 98, 117, 32, 110, 97, 32, 107, 117, 115, 104, 105, 107, 97, 109, 97, 110, 97, 32, 107, 97, 116, 105, 32, 121, 97, 111, 46, 32, 78, 121, 111, 116, 97, 32, 109, 97, 112, 97, 99, 104, 97, 32, 122, 97, 32, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 65, 32, 110, 97, 32, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 66, 32, 122, 105, 107, 111, 32, 109, 105, 97, 107, 97, 32, 121, 97, 32, 110, 117, 114, 117, 32, 52, 46, 51, 54, 32, 107, 117, 116, 111, 107, 97, 32, 107, 119, 101, 116, 117, 32, 110, 97, 32, 110, 121, 111, 116, 97, 32, 121, 97, 32, 116, 97, 116, 117, 32, 65, 108, 112, 
104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 67, 32, 97, 117, 32, 80, 114, 111, 120, 105, 109, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 105, 110, 97, 32, 117, 109, 98, 97, 108, 105, 32, 119, 97, 32, 109, 105, 97, 107, 97, 32, 121, 97, 32, 110, 117, 114, 117, 32, 52, 46, 50, 50, 46, 32, 57353, 32, 80, 114, 111, 120, 105, 109, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 40, 121, 97, 97, 110, 105, 32, 110, 121, 111, 116, 97, 32, 121, 97, 32, 75, 97, 110, 116, 97, 114, 117, 115, 105, 32, 105, 108, 105, 121, 111, 32, 107, 97, 114, 105, 98, 117, 32, 122, 97, 105, 100, 105, 32, 110, 97, 115, 105, 41, 32, 105, 109, 101, 103, 117, 110, 100, 117, 108, 105, 119, 97, 32, 107, 117, 119, 97, 32, 110, 97, 32, 115, 97, 121, 97, 114, 105, 32, 109, 111, 106, 97, 46, 32, 86, 105, 112, 105, 109, 111, 32, 118, 105, 110, 97, 118, 121, 111, 112, 97, 116, 105, 107, 97, 110, 97, 32, 104, 97, 100, 105, 32, 115, 97, 115, 97, 32, 122, 105, 110, 97, 111, 110, 121, 101, 115, 104, 97, 32, 117, 119, 101, 122, 101, 107, 97, 110, 111, 32, 109, 107, 117, 98, 119, 97, 32, 121, 97, 32, 107, 119, 97, 109, 98, 97, 32, 115, 97, 121, 97, 114, 105, 32, 104, 105, 105, 32, 110, 105, 32, 121, 97, 32, 109, 119, 97, 109, 98, 97, 32, 40, 107, 97, 109, 97, 32, 100, 117, 110, 105, 97, 32, 121, 101, 116, 117, 44, 32, 77, 105, 114, 105, 104, 105, 32, 97, 117, 32, 90, 117, 104, 117, 114, 97, 41, 32, 110, 97, 32, 105, 110, 97, 119, 101, 122, 97, 32, 107, 117, 119, 97, 32, 110, 97, 32, 97, 110, 103, 97, 104, 101, 119, 97, 44, 32, 116, 101, 110, 97, 32, 107, 97, 116, 105, 107, 97, 32, 117, 112, 101, 111, 32, 119, 97, 32, 106, 111, 116, 111, 32, 117, 110, 97, 111, 114, 117, 104, 117, 115, 117, 32, 107, 117, 119, 101, 112, 111, 32, 107, 119, 97, 32, 117, 104, 97, 105, 46, 32, 91, 49, 93, 57345, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
543
+ attention_mask = [1 if x != 0 else 0 for x in input_ids]
544
+ token_type_ids = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
545
+ # fmt: on
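+ # the example is zero-padded up to CANINE's full sequence length of 2048 characters (see the expected shape below)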
546
+ input_ids = torch.tensor([input_ids])
547
+ attention_mask = torch.tensor([attention_mask])
548
+ token_type_ids = torch.tensor([token_type_ids])
549
+ outputs = model(input_ids, attention_mask, token_type_ids)
550
+
551
+ # verify sequence output
552
+ expected_shape = torch.Size((1, 2048, 768))
553
+ self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
554
+
555
+ expected_slice = torch.tensor(
556
+ [
557
+ [-0.161433131, 0.395568609, 0.0407391489],
558
+ [-0.108025983, 0.362060368, -0.544592619],
559
+ [-0.141537309, 0.180541009, 0.076907],
560
+ ]
561
+ )
562
+
563
+ torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-2, atol=1e-2)
564
+
565
+ # verify pooled output
566
+ expected_shape = torch.Size((1, 768))
567
+ self.assertEqual(outputs.pooler_output.shape, expected_shape)
568
+
569
+ expected_slice = torch.tensor([-0.884311497, -0.529064834, 0.723164916])
570
+
571
+ torch.testing.assert_close(outputs.pooler_output[0, :3], expected_slice, rtol=1e-2, atol=1e-2)
docs/transformers/tests/models/canine/test_tokenization_canine.py ADDED
@@ -0,0 +1,339 @@
1
+ # Copyright 2021 Google AI and HuggingFace Inc. team.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ import os
17
+ import shutil
18
+ import tempfile
19
+ import unittest
20
+ from functools import lru_cache
21
+
22
+ from transformers import BatchEncoding, CanineTokenizer
23
+ from transformers.testing_utils import require_tokenizers, require_torch
24
+ from transformers.tokenization_utils import AddedToken
25
+ from transformers.utils import cached_property
26
+
27
+ from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
28
+
29
+
30
+ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
31
+ from_pretrained_id = "nielsr/canine-s"
32
+ tokenizer_class = CanineTokenizer
33
+ test_rust_tokenizer = False
34
+
35
+ @classmethod
36
+ def setUpClass(cls):
37
+ super().setUpClass()
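+ # CANINE has no vocabulary file: the default tokenizer simply maps characters to Unicode code points,
+ # so it can be instantiated without pretrained assets and saved for the common tests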
38
+ tokenizer = CanineTokenizer()
39
+ tokenizer.save_pretrained(cls.tmpdirname)
40
+
41
+ @cached_property
42
+ def canine_tokenizer(self):
43
+ return CanineTokenizer.from_pretrained("google/canine-s")
44
+
45
+ @classmethod
46
+ @use_cache_if_possible
47
+ @lru_cache(maxsize=64)
48
+ def get_tokenizer(cls, pretrained_name=None, **kwargs) -> CanineTokenizer:
49
+ pretrained_name = pretrained_name or cls.tmpdirname
50
+ tokenizer = cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
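+ # shrink the nominal unicode vocab size (by default every code point), presumably to keep the common tests fast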
51
+ tokenizer._unicode_vocab_size = 1024
52
+ return tokenizer
53
+
54
+ @require_torch
55
+ def test_prepare_batch_integration(self):
56
+ tokenizer = self.canine_tokenizer
57
+ src_text = ["Life is like a box of chocolates.", "You never know what you're gonna get."]
58
+ expected_src_tokens = [57344, 76, 105, 102, 101, 32, 105, 115, 32, 108, 105, 107, 101, 32, 97, 32, 98, 111, 120, 32, 111, 102, 32, 99, 104, 111, 99, 111, 108, 97, 116, 101, 115, 46, 57345, 0, 0, 0, 0] # fmt: skip
59
+ batch = tokenizer(src_text, padding=True, return_tensors="pt")
60
+ self.assertIsInstance(batch, BatchEncoding)
61
+
62
+ result = list(batch.input_ids.numpy()[0])
63
+
64
+ self.assertListEqual(expected_src_tokens, result)
65
+
66
+ self.assertEqual((2, 39), batch.input_ids.shape)
67
+ self.assertEqual((2, 39), batch.attention_mask.shape)
68
+
69
+ @require_torch
70
+ def test_encoding_keys(self):
71
+ tokenizer = self.canine_tokenizer
72
+ src_text = ["Once there was a man.", "He wrote a test in HuggingFace Transformers."]
73
+ batch = tokenizer(src_text, padding=True, return_tensors="pt")
74
+ # check if input_ids, attention_mask and token_type_ids are returned
75
+ self.assertIn("input_ids", batch)
76
+ self.assertIn("attention_mask", batch)
77
+ self.assertIn("token_type_ids", batch)
78
+
79
+ @require_torch
80
+ def test_max_length_integration(self):
81
+ tokenizer = self.canine_tokenizer
82
+ tgt_text = [
83
+ "What's the weater?",
84
+ "It's about 25 degrees.",
85
+ ]
86
+ targets = tokenizer(
87
+ text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors="pt"
88
+ )
89
+ self.assertEqual(32, targets["input_ids"].shape[1])
90
+
91
+ # cannot use default save_and_load_tokenizer test method because tokenizer has no vocab
92
+ def test_save_and_load_tokenizer(self):
93
+ # safety check on max_len default value so we are sure the test works
94
+ tokenizers = self.get_tokenizers()
95
+ for tokenizer in tokenizers:
96
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
97
+ self.assertNotEqual(tokenizer.model_max_length, 42)
98
+
99
+ # Now let's start the test
100
+ tokenizers = self.get_tokenizers()
101
+ for tokenizer in tokenizers:
102
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
103
+ # Isolate this from the other tests because we save additional tokens/etc
104
+ tmpdirname = tempfile.mkdtemp()
105
+
106
+ sample_text = " He is very happy, UNwant\u00e9d,running"
107
+ before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
108
+ tokenizer.save_pretrained(tmpdirname)
109
+
110
+ after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
111
+ after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
112
+ self.assertListEqual(before_tokens, after_tokens)
113
+
114
+ shutil.rmtree(tmpdirname)
115
+
116
+ tokenizers = self.get_tokenizers(model_max_length=42)
117
+ for tokenizer in tokenizers:
118
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
119
+ # Isolate this from the other tests because we save additional tokens/etc
120
+ tmpdirname = tempfile.mkdtemp()
121
+
122
+ sample_text = " He is very happy, UNwant\u00e9d,running"
123
+
124
+ additional_special_tokens = tokenizer.additional_special_tokens
125
+
126
+ # We can add a new special token for Canine as follows:
127
+ new_additional_special_token = chr(0xE007)
128
+ additional_special_tokens.append(new_additional_special_token)
129
+ tokenizer.add_special_tokens(
130
+ {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False
131
+ )
132
+ before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
133
+ tokenizer.save_pretrained(tmpdirname)
134
+
135
+ after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
136
+ after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
137
+ self.assertListEqual(before_tokens, after_tokens)
138
+ self.assertIn(new_additional_special_token, after_tokenizer.additional_special_tokens)
139
+ self.assertEqual(after_tokenizer.model_max_length, 42)
140
+
141
+ tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43)
142
+ self.assertEqual(tokenizer.model_max_length, 43)
143
+
144
+ shutil.rmtree(tmpdirname)
145
+
146
+ def test_add_special_tokens(self):
147
+ tokenizers = self.get_tokenizers(do_lower_case=False)
148
+ for tokenizer in tokenizers:
149
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
150
+ input_text, ids = self.get_clean_sequence(tokenizer)
151
+
152
+ # a special token for Canine can be defined as follows:
153
+ SPECIAL_TOKEN = 0xE005
154
+ special_token = chr(SPECIAL_TOKEN)
155
+
156
+ tokenizer.add_special_tokens({"cls_token": special_token})
157
+ encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
158
+ self.assertEqual(len(encoded_special_token), 1)
159
+
160
+ text = tokenizer.decode(ids + encoded_special_token, clean_up_tokenization_spaces=False)
161
+ encoded = tokenizer.encode(text, add_special_tokens=False)
162
+
163
+ input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
164
+ special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
165
+ self.assertEqual(encoded, input_encoded + special_token_id)
166
+
167
+ decoded = tokenizer.decode(encoded, skip_special_tokens=True)
168
+ self.assertTrue(special_token not in decoded)
169
+
170
+ def test_tokenize_special_tokens(self):
171
+ tokenizers = self.get_tokenizers(do_lower_case=True)
172
+ for tokenizer in tokenizers:
173
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
174
+ SPECIAL_TOKEN_1 = chr(0xE005)
175
+ SPECIAL_TOKEN_2 = chr(0xE006)
176
+ tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True)
177
+ tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_TOKEN_2]})
178
+
179
+ token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1)
180
+ token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2)
181
+
182
+ self.assertEqual(len(token_1), 1)
183
+ self.assertEqual(len(token_2), 1)
184
+ self.assertEqual(token_1[0], SPECIAL_TOKEN_1)
185
+ self.assertEqual(token_2[0], SPECIAL_TOKEN_2)
186
+
187
+ @require_tokenizers
188
+ def test_added_token_serializable(self):
189
+ tokenizers = self.get_tokenizers(do_lower_case=False)
190
+ for tokenizer in tokenizers:
191
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
192
+ # a special token for Canine can be defined as follows:
193
+ NEW_TOKEN = 0xE006
194
+ new_token = chr(NEW_TOKEN)
195
+
196
+ new_token = AddedToken(new_token, lstrip=True)
197
+ tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})
198
+
199
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
200
+ tokenizer.save_pretrained(tmp_dir_name)
201
+ tokenizer.from_pretrained(tmp_dir_name)
202
+
203
+ def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
204
+ tokenizer_list = []
205
+ if self.test_slow_tokenizer:
206
+ tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
207
+
208
+ if self.test_rust_tokenizer:
209
+ tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
210
+
211
+ for tokenizer_class, tokenizer_utils in tokenizer_list:
212
+ with tempfile.TemporaryDirectory() as tmp_dir:
213
+ tokenizer_utils.save_pretrained(tmp_dir)
214
+
215
+ with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
216
+ special_tokens_map = json.load(json_file)
217
+
218
+ with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
219
+ tokenizer_config = json.load(json_file)
220
+
221
+ # a special token for Canine can be defined as follows:
222
+ NEW_TOKEN = 0xE006
223
+ new_token_1 = chr(NEW_TOKEN)
224
+
225
+ special_tokens_map["additional_special_tokens"] = [new_token_1]
226
+ tokenizer_config["additional_special_tokens"] = [new_token_1]
227
+
228
+ with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
229
+ json.dump(special_tokens_map, outfile)
230
+ with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
231
+ json.dump(tokenizer_config, outfile)
232
+
233
+ # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
234
+ # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
235
+ # "special_tokens_map.json" files
236
+ tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir, extra_ids=0)
237
+ self.assertIn(new_token_1, tokenizer_without_change_in_init.additional_special_tokens)
238
+ # self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab
239
+ self.assertEqual(
240
+ [new_token_1],
241
+ tokenizer_without_change_in_init.convert_ids_to_tokens(
242
+ tokenizer_without_change_in_init.convert_tokens_to_ids([new_token_1])
243
+ ),
244
+ )
245
+
246
+ NEW_TOKEN = 0xE007
247
+ new_token_2 = chr(NEW_TOKEN)
248
+ # Now we test that we can change the value of additional_special_tokens in the from_pretrained
249
+ new_added_tokens = [AddedToken(new_token_2, lstrip=True)]
250
+ tokenizer = tokenizer_class.from_pretrained(
251
+ tmp_dir, additional_special_tokens=new_added_tokens, extra_ids=0
252
+ )
253
+
254
+ self.assertIn(new_token_2, tokenizer.additional_special_tokens)
255
+ # self.assertIn(new_token_2,tokenizer.get_vocab()) # ByT5Tokenization no vocab
256
+ self.assertEqual(
257
+ [new_token_2], tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids([new_token_2]))
258
+ )
259
+
260
+ @require_tokenizers
261
+ def test_encode_decode_with_spaces(self):
262
+ tokenizers = self.get_tokenizers(do_lower_case=False)
263
+ for tokenizer in tokenizers:
264
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
265
+ input = "hello world"
266
+ if self.space_between_special_tokens:
267
+ output = "[CLS] hello world [SEP]"
268
+ else:
269
+ output = input
270
+ encoded = tokenizer.encode(input, add_special_tokens=False)
271
+ decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
272
+ self.assertIn(decoded, [output, output.lower()])
273
+
274
+ # cannot use default `test_tokenizers_common_ids_setters` method because tokenizer has no vocab
275
+ def test_tokenizers_common_ids_setters(self):
276
+ tokenizers = self.get_tokenizers()
277
+ for tokenizer in tokenizers:
278
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
279
+ attributes_list = [
280
+ "bos_token",
281
+ "eos_token",
282
+ "unk_token",
283
+ "sep_token",
284
+ "pad_token",
285
+ "cls_token",
286
+ "mask_token",
287
+ ]
288
+
289
+ token_to_test_setters = "a"
290
+ token_id_to_test_setters = ord(token_to_test_setters)
291
+
292
+ for attr in attributes_list:
293
+ setattr(tokenizer, attr + "_id", None)
294
+ self.assertEqual(getattr(tokenizer, attr), None)
295
+ self.assertEqual(getattr(tokenizer, attr + "_id"), None)
296
+
297
+ setattr(tokenizer, attr + "_id", token_id_to_test_setters)
298
+ self.assertEqual(getattr(tokenizer, attr), token_to_test_setters)
299
+ self.assertEqual(getattr(tokenizer, attr + "_id"), token_id_to_test_setters)
300
+
301
+ setattr(tokenizer, "additional_special_tokens_ids", [])
302
+ self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [])
303
+ self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [])
304
+
305
+ additional_special_token_id = 0xE006
306
+ additional_special_token = chr(additional_special_token_id)
307
+ setattr(tokenizer, "additional_special_tokens_ids", [additional_special_token_id])
308
+ self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [additional_special_token])
309
+ self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [additional_special_token_id])
310
+
311
+ @unittest.skip(reason="tokenizer has a fixed vocab_size (namely all possible unicode code points)")
312
+ def test_add_tokens_tokenizer(self):
313
+ pass
314
+
315
+ # CanineTokenizer does not support do_lower_case = True, as each character has its own Unicode code point
316
+ # ("b" and "B" for example have different Unicode code points)
317
+ @unittest.skip(reason="CanineTokenizer does not support do_lower_case = True")
318
+ def test_added_tokens_do_lower_case(self):
319
+ pass
320
+
321
+ @unittest.skip(reason="CanineModel does not support the get_input_embeddings nor the get_vocab method")
322
+ def test_np_encode_plus_sent_to_model(self):
323
+ pass
324
+
325
+ @unittest.skip(reason="CanineModel does not support the get_input_embeddings nor the get_vocab method")
326
+ def test_torch_encode_plus_sent_to_model(self):
327
+ pass
328
+
329
+ @unittest.skip(reason="CanineTokenizer does not have vocabulary")
330
+ def test_get_vocab(self):
331
+ pass
332
+
333
+ @unittest.skip(reason="inputs cannot be pretokenized since ids depend on whole input string")
334
+ def test_pretokenized_inputs(self):
335
+ pass
336
+
337
+ @unittest.skip(reason="CanineTokenizer does not have vocabulary")
338
+ def test_conversion_reversible(self):
339
+ pass
docs/transformers/tests/models/chameleon/__init__.py ADDED
File without changes
docs/transformers/tests/models/chameleon/test_image_processing_chameleon.py ADDED
@@ -0,0 +1,204 @@
1
+ # Copyright 2024 HuggingFace Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import unittest
16
+
17
+ import numpy as np
18
+
19
+ from transformers.testing_utils import require_torch, require_vision
20
+ from transformers.utils import is_torch_available, is_vision_available
21
+
22
+ from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
23
+
24
+
25
+ if is_torch_available():
26
+ import torch
27
+
28
+ if is_vision_available():
29
+ from PIL import Image
30
+
31
+ from transformers import ChameleonImageProcessor
32
+
33
+
34
+ class ChameleonImageProcessingTester:
35
+ def __init__(
36
+ self,
37
+ parent,
38
+ batch_size=7,
39
+ num_channels=3,
40
+ image_size=18,
41
+ min_resolution=30,
42
+ max_resolution=200,
43
+ do_resize=True,
44
+ size=None,
45
+ do_center_crop=True,
46
+ crop_size=None,
47
+ do_normalize=True,
48
+ image_mean=[1.0, 1.0, 1.0],
49
+ image_std=[1.0, 1.0, 1.0],
50
+ do_convert_rgb=True,
51
+ ):
52
+ size = size if size is not None else {"shortest_edge": 18}
53
+ crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
54
+ self.parent = parent
55
+ self.batch_size = batch_size
56
+ self.num_channels = num_channels
57
+ self.image_size = image_size
58
+ self.min_resolution = min_resolution
59
+ self.max_resolution = max_resolution
60
+ self.do_resize = do_resize
61
+ self.size = size
62
+ self.do_center_crop = do_center_crop
63
+ self.crop_size = crop_size
64
+ self.do_normalize = do_normalize
65
+ self.image_mean = image_mean
66
+ self.image_std = image_std
67
+ self.do_convert_rgb = do_convert_rgb
68
+
69
+ def prepare_image_processor_dict(self):
70
+ return {
71
+ "do_resize": self.do_resize,
72
+ "size": self.size,
73
+ "do_center_crop": self.do_center_crop,
74
+ "crop_size": self.crop_size,
75
+ "do_normalize": self.do_normalize,
76
+ "image_mean": self.image_mean,
77
+ "image_std": self.image_std,
78
+ "do_convert_rgb": self.do_convert_rgb,
79
+ }
80
+
81
+ # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape
82
+ def expected_output_image_shape(self, images):
83
+ return self.num_channels, self.crop_size["height"], self.crop_size["width"]
84
+
85
+ # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs
86
+ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
87
+ return prepare_image_inputs(
88
+ batch_size=self.batch_size,
89
+ num_channels=self.num_channels,
90
+ min_resolution=self.min_resolution,
91
+ max_resolution=self.max_resolution,
92
+ equal_resolution=equal_resolution,
93
+ numpify=numpify,
94
+ torchify=torchify,
95
+ )
96
+
97
+
98
+ @require_torch
99
+ @require_vision
100
+ class ChameleonImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
101
+ image_processing_class = ChameleonImageProcessor if is_vision_available() else None
102
+
103
+ # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->Chameleon
104
+ def setUp(self):
105
+ super().setUp()
106
+ self.image_processor_tester = ChameleonImageProcessingTester(self)
107
+
108
+ @property
109
+ # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict
110
+ def image_processor_dict(self):
111
+ return self.image_processor_tester.prepare_image_processor_dict()
112
+
113
+ def test_image_processor_properties(self):
114
+ image_processing = self.image_processing_class(**self.image_processor_dict)
115
+ self.assertTrue(hasattr(image_processing, "do_resize"))
116
+ self.assertTrue(hasattr(image_processing, "size"))
117
+ self.assertTrue(hasattr(image_processing, "do_center_crop"))
118
+ self.assertTrue(hasattr(image_processing, "center_crop"))
119
+ self.assertTrue(hasattr(image_processing, "do_normalize"))
120
+ self.assertTrue(hasattr(image_processing, "image_mean"))
121
+ self.assertTrue(hasattr(image_processing, "image_std"))
122
+ self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
123
+
124
+ def test_image_processor_from_dict_with_kwargs(self):
125
+ image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
126
+ self.assertEqual(image_processor.size, {"shortest_edge": 18})
127
+ self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
128
+
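+ # integer kwargs should be expanded into the corresponding size dictionaries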
129
+ image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84)
130
+ self.assertEqual(image_processor.size, {"shortest_edge": 42})
131
+ self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
132
+
133
+ def test_call_pil(self):
134
+ # Initialize image_processing
135
+ image_processing = self.image_processing_class(**self.image_processor_dict)
136
+ # create random PIL images
137
+ image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
138
+ for image in image_inputs:
139
+ self.assertIsInstance(image, Image.Image)
140
+
141
+ # Test not batched input
142
+ encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
143
+ expected_output_image_shape = (1, 3, 18, 18)
144
+ self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
145
+
146
+ # Test batched
147
+ encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
148
+ expected_output_image_shape = (7, 3, 18, 18)
149
+ self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
150
+
151
+ def test_call_numpy(self):
152
+ # Initialize image_processing
153
+ image_processing = self.image_processing_class(**self.image_processor_dict)
154
+ # create random numpy tensors
155
+ image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
156
+ for image in image_inputs:
157
+ self.assertIsInstance(image, np.ndarray)
158
+
159
+ # Test not batched input
160
+ encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
161
+ expected_output_image_shape = (1, 3, 18, 18)
162
+ self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
163
+
164
+ # Test batched
165
+ encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
166
+ expected_output_image_shape = (7, 3, 18, 18)
167
+ self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
168
+
169
+ def test_call_pytorch(self):
170
+ # Initialize image_processing
171
+ image_processing = self.image_processing_class(**self.image_processor_dict)
172
+ # create random PyTorch tensors
173
+ image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
174
+
175
+ for image in image_inputs:
176
+ self.assertIsInstance(image, torch.Tensor)
177
+
178
+ # Test not batched input
179
+ encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
180
+ expected_output_image_shape = (1, 3, 18, 18)
181
+ self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
182
+
183
+ # Test batched
184
+ encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
185
+ expected_output_image_shape = (7, 3, 18, 18)
186
+ self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
187
+
188
+ def test_nested_input(self):
189
+ image_processing = self.image_processing_class(**self.image_processor_dict)
190
+ image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
191
+
192
+ # Test batched as a list of images
193
+ encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
194
+ expected_output_image_shape = (7, 3, 18, 18)
195
+ self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
196
+
197
+ # Test batched as a nested list of images, where each sublist is one batch
198
+ image_inputs_nested = [image_inputs[:3], image_inputs[3:]]
199
+ encoded_images_nested = image_processing(image_inputs_nested, return_tensors="pt").pixel_values
200
+ expected_output_image_shape = (7, 3, 18, 18)
201
+ self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape)
202
+
203
+ # Image processor should return same pixel values, independently of input format
204
+ self.assertTrue((encoded_images_nested == encoded_images).all())
docs/transformers/tests/models/chameleon/test_modeling_chameleon.py ADDED
@@ -0,0 +1,481 @@
1
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Testing suite for the PyTorch chameleon model."""
15
+
16
+ import copy
17
+ import unittest
18
+
19
+ import requests
20
+ from parameterized import parameterized
21
+
22
+ from transformers import ChameleonConfig, is_torch_available, is_vision_available, set_seed
23
+ from transformers.testing_utils import (
24
+ require_bitsandbytes,
25
+ require_read_token,
26
+ require_torch,
27
+ slow,
28
+ torch_device,
29
+ )
30
+
31
+ from ...generation.test_utils import GenerationTesterMixin
32
+ from ...test_configuration_common import ConfigTester
33
+ from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
34
+ from ...test_pipeline_mixin import PipelineTesterMixin
35
+
36
+
37
+ if is_vision_available():
38
+ from PIL import Image
39
+
40
+ if is_torch_available():
41
+ import torch
42
+
43
+ from transformers import (
44
+ ChameleonForConditionalGeneration,
45
+ ChameleonModel,
46
+ ChameleonProcessor,
47
+ )
48
+
49
+
50
+ class ChameleonModelTester:
51
+ def __init__(
52
+ self,
53
+ parent,
54
+ batch_size=13,
55
+ seq_length=35,
56
+ is_training=False,
57
+ use_input_mask=True,
58
+ use_labels=True,
59
+ vocab_size=99,
60
+ image_token_id=4,
61
+ hidden_size=32,
62
+ num_hidden_layers=2,
63
+ num_attention_heads=2,
64
+ num_key_value_heads=2,
65
+ intermediate_size=37,
66
+ hidden_act="gelu",
67
+ hidden_dropout_prob=0.1,
68
+ attention_probs_dropout_prob=0.1,
69
+ max_position_embeddings=512,
70
+ type_vocab_size=16,
71
+ type_sequence_label_size=2,
72
+ initializer_range=0.02,
73
+ num_labels=3,
74
+ num_choices=4,
75
+ pad_token_id=0,
76
+ vq_num_embeds=5,
77
+ vq_embed_dim=5,
78
+ vq_channel_multiplier=[1, 4],
79
+ vq_img_token_start_id=10, # has to be less than vocab size when added with vq_num_embeds
80
+ scope=None,
81
+ ):
82
+ self.parent = parent
83
+ self.batch_size = batch_size
84
+ self.seq_length = seq_length
85
+ self.is_training = is_training
86
+ self.use_input_mask = use_input_mask
87
+ self.use_labels = use_labels
88
+ self.vocab_size = vocab_size
89
+ self.image_token_id = image_token_id
90
+ self.hidden_size = hidden_size
91
+ self.num_hidden_layers = num_hidden_layers
92
+ self.num_attention_heads = num_attention_heads
93
+ self.num_key_value_heads = num_key_value_heads
94
+ self.intermediate_size = intermediate_size
95
+ self.hidden_act = hidden_act
96
+ self.hidden_dropout_prob = hidden_dropout_prob
97
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
98
+ self.max_position_embeddings = max_position_embeddings
99
+ self.type_vocab_size = type_vocab_size
100
+ self.type_sequence_label_size = type_sequence_label_size
101
+ self.initializer_range = initializer_range
102
+ self.num_labels = num_labels
103
+ self.num_choices = num_choices
104
+ self.pad_token_id = pad_token_id
105
+ self.scope = scope
106
+ self.vq_num_embeds = vq_num_embeds
107
+ self.vq_embed_dim = vq_embed_dim
108
+ self.vq_channel_multiplier = vq_channel_multiplier
109
+ self.vq_img_token_start_id = vq_img_token_start_id
110
+
111
+ def prepare_config_and_inputs(self):
112
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
113
+
114
+ input_mask = None
115
+ if self.use_input_mask:
116
+ input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device))
117
+
118
+ sequence_labels = None
119
+ token_labels = None
120
+ choice_labels = None
121
+ if self.use_labels:
122
+ sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
123
+ token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
124
+ choice_labels = ids_tensor([self.batch_size], self.num_choices)
125
+
126
+ config = self.get_config()
127
+
128
+ return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
129
+
130
+ def get_config(self):
131
+ # create a dummy vocab map for the image-to-BPE mapping in case it needs remapping
133
+ # we assume the vocab size is big enough to account for image tokens somewhere near the beginning,
134
+ # in the same way as in the real checkpoint, where image tokens sit in the first half of the embeddings;
135
+ # we will need "vq_num_embeds" such tokens
135
+
136
+ vocab_map = {i: chr(i) for i in range(self.vocab_size)}
137
+ vocab_map[self.image_token_id] = "<image>"
138
+ start = self.vq_img_token_start_id
139
+ end = self.vq_img_token_start_id + self.vq_num_embeds
140
+ for i in range(start, end):
141
+ image_token_infix = "".join(chr(ord("A") + int(c)) for c in str(i))
142
+ # dummy str for each image token, anything starting with IMGIMG
143
+ vocab_map[i] = f"IMGIMG{image_token_infix}Z"
144
+
145
+ return ChameleonConfig(
146
+ vocab_size=self.vocab_size,
147
+ hidden_size=self.hidden_size,
148
+ num_hidden_layers=self.num_hidden_layers,
149
+ num_attention_heads=self.num_attention_heads,
150
+ num_key_value_heads=self.num_key_value_heads,
151
+ intermediate_size=self.intermediate_size,
152
+ hidden_act=self.hidden_act,
153
+ hidden_dropout_prob=self.hidden_dropout_prob,
154
+ attention_probs_dropout_prob=self.attention_probs_dropout_prob,
155
+ max_position_embeddings=self.max_position_embeddings,
156
+ type_vocab_size=self.type_vocab_size,
157
+ is_decoder=False,
158
+ initializer_range=self.initializer_range,
159
+ pad_token_id=self.pad_token_id,
160
+ vocabulary_map={v: k for k, v in vocab_map.items()},
161
+ vq_config=self.get_vq_config(),
162
+ )
163
+
164
+ def get_vq_config(self):
165
+ return {
166
+ "embed_dim": self.vq_embed_dim,
167
+ "num_embeddings": self.vq_num_embeds,
168
+ "latent_channels": self.vq_embed_dim,
169
+ "in_channels": 3,
170
+ "base_channels": 32, # we have a GroupNorm of 32 groups, so can't do less
171
+ "channel_multiplier": self.vq_channel_multiplier,
172
+ }
173
+
174
+ def create_and_check_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
175
+ model = ChameleonModel(config=config)
176
+ model.to(torch_device)
177
+ model.eval()
178
+ result = model(input_ids, attention_mask=input_mask)
179
+ result = model(input_ids)
180
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
181
+
182
+ def prepare_config_and_inputs_for_common(self):
183
+ config_and_inputs = self.prepare_config_and_inputs()
184
+ (
185
+ config,
186
+ input_ids,
187
+ input_mask,
188
+ sequence_labels,
189
+ token_labels,
190
+ choice_labels,
191
+ ) = config_and_inputs
192
+ inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
193
+ return config, inputs_dict
194
+
195
+
196
+ @require_torch
197
+ class ChameleonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
198
+ all_model_classes = (ChameleonModel, ChameleonForConditionalGeneration) if is_torch_available() else ()
199
+ pipeline_model_mapping = (
200
+ {
201
+ "feature-extraction": ChameleonModel,
202
+ "text-generation": ChameleonForConditionalGeneration,
203
+ }
204
+ if is_torch_available()
205
+ else {}
206
+ )
207
+ test_headmasking = False
208
+ test_pruning = False
209
+ fx_compatible = False
210
+
211
+ def setUp(self):
212
+ self.model_tester = ChameleonModelTester(self)
213
+ self.config_tester = ConfigTester(self, config_class=ChameleonConfig, hidden_size=37)
214
+
215
+ def test_config(self):
216
+ self.config_tester.run_common_tests()
217
+
218
+ def test_model(self):
219
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
220
+ self.model_tester.create_and_check_model(*config_and_inputs)
221
+
222
+ @parameterized.expand([("linear",), ("dynamic",)])
223
+ def test_model_rope_scaling(self, scaling_type):
224
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
225
+ short_input = ids_tensor([1, 10], config.vocab_size)
226
+ long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size)
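+ # the long input deliberately exceeds max_position_embeddings so that RoPE scaling has to extrapolate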
227
+
228
+ set_seed(42) # Fixed seed at init time so the two models get the same random weights
229
+ original_model = ChameleonModel(config)
230
+ original_model.to(torch_device)
231
+ original_model.eval()
232
+ original_short_output = original_model(short_input).last_hidden_state
233
+ original_long_output = original_model(long_input).last_hidden_state
234
+
235
+ set_seed(42) # Fixed seed at init time so the two models get the same random weights
236
+ config.rope_scaling = {"type": scaling_type, "factor": 10.0}
237
+ scaled_model = ChameleonModel(config)
238
+ scaled_model.to(torch_device)
239
+ scaled_model.eval()
240
+ scaled_short_output = scaled_model(short_input).last_hidden_state
241
+ scaled_long_output = scaled_model(long_input).last_hidden_state
242
+
243
+ # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original
244
+ # maximum sequence length, so the outputs for the short input should match.
245
+ if scaling_type == "dynamic":
246
+ torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5)
247
+ else:
248
+ self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5))
249
+
250
+ # The output should be different for long inputs
251
+ self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5))
252
+
253
+ @unittest.skip("Chameleon forces some token ids to be -inf!")
254
+ def test_batching_equivalence(self):
255
+ pass
256
+
257
+ @unittest.skip("Chameleon VQ model cannot be squishes more due to hardcoded layer params in model code")
258
+ def test_model_is_small(self):
259
+ pass
260
+
261
+
262
+ class ChameleonVision2SeqModelTester(ChameleonModelTester):
263
+ def __init__(self, parent, image_size=10, **kwargs):
264
+ super().__init__(parent, **kwargs)
265
+ self.image_size = image_size
266
+ self.image_seq_length = 25
267
+
268
+ def prepare_config_and_inputs(self):
269
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
270
+ input_ids[input_ids == self.image_token_id] = self.pad_token_id
271
+ input_ids[:, : self.image_seq_length] = self.image_token_id
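+ # reserve the first image_seq_length positions for image placeholder tokens; pixel_values below provide the matching images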
272
+ attention_mask = torch.tril(torch.ones_like(input_ids).to(torch_device))
273
+ pixel_values = floats_tensor([self.batch_size, 3, self.image_size, self.image_size])
274
+
275
+ config = self.get_config()
276
+
277
+ return config, input_ids, attention_mask, pixel_values
278
+
279
+ def prepare_config_and_inputs_for_common(self):
280
+ config_and_inputs = self.prepare_config_and_inputs()
281
+ config, input_ids, attention_mask, pixel_values = config_and_inputs
282
+ inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask, "pixel_values": pixel_values}
283
+ return config, inputs_dict
284
+
285
+
286
+ @require_torch
287
+ class ChameleonVision2SeqModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
288
+ all_model_classes = (ChameleonModel, ChameleonForConditionalGeneration) if is_torch_available() else ()
289
+ pipeline_model_mapping = (
290
+ {
291
+ "image-text-to-text": ChameleonForConditionalGeneration,
292
+ }
293
+ if is_torch_available()
294
+ else {}
295
+ )
296
+ test_headmasking = False
297
+ test_pruning = False
298
+ fx_compatible = False
299
+
300
+ def setUp(self):
301
+ self.model_tester = ChameleonVision2SeqModelTester(self)
302
+ self.config_tester = ConfigTester(self, config_class=ChameleonConfig, hidden_size=37)
303
+
304
+ def test_config(self):
305
+ self.config_tester.run_common_tests()
306
+
307
+ @unittest.skip("Chameleon forces some token ids to be -inf!")
308
+ def test_batching_equivalence(self):
309
+ pass
310
+
311
+ @unittest.skip("Chameleon cannot do offload because it uses `self.linear.weight` in forward")
312
+ def test_cpu_offload(self):
313
+ pass
314
+
315
+ @unittest.skip("Chameleon cannot do offload because it uses `self.linear.weight` in forward")
316
+ def test_disk_offload_bin(self):
317
+ pass
318
+
319
+ @unittest.skip("Chameleon cannot do offload because it uses `self.linear.weight` in forward")
320
+ def test_disk_offload_safetensors(self):
321
+ pass
322
+
323
+ @unittest.skip("Chameleon VQ model cannot be squishes more due to hardcoded layer params in model code")
324
+ def test_model_is_small(self):
325
+ pass
326
+
327
+ def test_mismatching_num_image_tokens(self):
328
+ """
329
+ Tests that VLMs throw an error with an explicit message saying what is wrong
330
+ when the number of images doesn't match the number of image tokens in the text.
331
+ Also tests multi-image cases where one prompt has multiple image tokens.
332
+ """
333
+ config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
334
+ for model_class in self.all_model_classes:
335
+ model = model_class(config).to(torch_device)
336
+ curr_input_dict = copy.deepcopy(input_dict) # the below tests modify dict in-place
337
+ _ = model(**curr_input_dict) # successful forward with no modifications
338
+
339
+ # remove one image but leave the image token in text
340
+ curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
341
+ with self.assertRaises(ValueError):
342
+ _ = model(**curr_input_dict)
343
+
344
+ # simulate multi-image case by concatenating inputs where each has exactly one image/image-token
345
+ input_ids = curr_input_dict["input_ids"][:1]
346
+ pixel_values = curr_input_dict["pixel_values"][:1]
347
+ input_ids = torch.cat([input_ids, input_ids], dim=0)
348
+
349
+ # one image and two image tokens raise an error
350
+ with self.assertRaises(ValueError):
351
+ _ = model(input_ids=input_ids, pixel_values=pixel_values)
352
+
353
+ # two images and two image tokens don't raise an error
354
+ pixel_values = torch.cat([pixel_values, pixel_values], dim=0)
355
+ _ = model(input_ids=input_ids, pixel_values=pixel_values)
356
+
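For context, a minimal sketch of the consistency check this test expects the model to perform, assuming each image is expanded into a fixed block of placeholder tokens; names and shapes below are illustrative, not the actual modeling code:

    def check_image_token_consistency(input_ids, image_features, image_token_id):
        # Each image contributes one feature vector per placeholder position, so the
        # number of <image> tokens in the text must match the number of image features
        # derived from pixel_values; otherwise the forward pass should raise.
        n_image_tokens = int((input_ids == image_token_id).sum())
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if n_image_tokens != n_image_features:
            raise ValueError(
                f"Image features and image tokens do not match: "
                f"tokens={n_image_tokens}, features={n_image_features}"
            )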
357
+ # overwrite inputs_embeds tests because we need to delete "pixel_values" for LVLMs
358
+ def test_inputs_embeds(self):
359
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
360
+
361
+ for model_class in self.all_model_classes:
362
+ model = model_class(config)
363
+ model.to(torch_device)
364
+ model.eval()
365
+
366
+ inputs = self._prepare_for_class(inputs_dict, model_class)
367
+
368
+ input_ids = inputs["input_ids"]
369
+ del inputs["input_ids"]
370
+ del inputs["pixel_values"]
371
+
372
+ wte = model.get_input_embeddings()
373
+ inputs["inputs_embeds"] = wte(input_ids)
374
+
375
+ with torch.no_grad():
376
+ model(**inputs)
377
+
378
+ # overwrite inputs_embeds tests because we need to delete "pixel_values" for LVLMs
379
+ # while some other models require pixel_values to be present
380
+ def test_inputs_embeds_matches_input_ids(self):
381
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
382
+
383
+ for model_class in self.all_model_classes:
384
+ model = model_class(config)
385
+ model.to(torch_device)
386
+ model.eval()
387
+
388
+ inputs = self._prepare_for_class(inputs_dict, model_class)
389
+ input_ids = inputs["input_ids"]
390
+ del inputs["input_ids"]
391
+ del inputs["pixel_values"]
392
+
393
+ inputs_embeds = model.get_input_embeddings()(input_ids)
394
+
395
+ with torch.no_grad():
396
+ out_ids = model(input_ids=input_ids, **inputs)[0]
397
+ out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
398
+ torch.testing.assert_close(out_embeds, out_ids)
399
+
400
+
401
+ @require_torch
402
+ class ChameleonIntegrationTest(unittest.TestCase):
403
+ @slow
404
+ @require_bitsandbytes
405
+ @require_read_token
406
+ def test_model_7b(self):
407
+ model = ChameleonForConditionalGeneration.from_pretrained(
408
+ "facebook/chameleon-7b", load_in_4bit=True, device_map="auto"
409
+ )
410
+ processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
411
+
412
+ image = Image.open(
413
+ requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw
414
+ )
415
+ prompt = "<image>Describe what do you see here and tell me about the history behind it?"
416
+
417
+ inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, torch.float16)
418
+
419
+ # greedy generation outputs
420
+ EXPECTED_TEXT_COMPLETION = ['Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Alpha Centauri. The star map is a representation of the night sky, showing the positions of stars in'] # fmt: skip
421
+ generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
422
+ text = processor.batch_decode(generated_ids, skip_special_tokens=True)
423
+ self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
424
+
425
+ @slow
426
+ @require_bitsandbytes
427
+ @require_read_token
428
+ def test_model_7b_batched(self):
429
+ model = ChameleonForConditionalGeneration.from_pretrained(
430
+ "facebook/chameleon-7b", load_in_4bit=True, device_map="auto"
431
+ )
432
+ processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
433
+
434
+ image = Image.open(
435
+ requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw
436
+ )
437
+ image_2 = Image.open(
438
+ requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw
439
+ )
440
+ prompts = [
441
+ "<image>Describe what do you see here and tell me about the history behind it?",
442
+ "What constellation is this image showing?<image>",
443
+ ]
444
+
445
+ inputs = processor(images=[image, image_2], text=prompts, padding=True, return_tensors="pt").to(
446
+ model.device, torch.float16
447
+ )
448
+
449
+ # greedy generation outputs
450
+ EXPECTED_TEXT_COMPLETION = [
451
+ 'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Alpha Centauri. The star map is a representation of the night sky, showing the positions of stars in',
452
+ 'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.'
453
+ ] # fmt: skip
454
+ generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
455
+ text = processor.batch_decode(generated_ids, skip_special_tokens=True)
456
+ self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
457
+
458
+ @slow
459
+ @require_bitsandbytes
460
+ @require_read_token
461
+ def test_model_7b_multi_image(self):
462
+ model = ChameleonForConditionalGeneration.from_pretrained(
463
+ "facebook/chameleon-7b", load_in_4bit=True, device_map="auto"
464
+ )
465
+ processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
466
+
467
+ image = Image.open(
468
+ requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw
469
+ )
470
+ image_2 = Image.open(
471
+ requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw
472
+ )
473
+ prompt = "What do these two images have in common?<image><image>"
474
+
475
+ inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.float16)
476
+
477
+ # greedy generation outputs
478
+ EXPECTED_TEXT_COMPLETION = ['What do these two images have in common?The two images show a connection between the night sky and the internet. The first image shows a starry night sky, with the stars arranged in a pattern that resembles the structure of the internet. The'] # fmt: skip
479
+ generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
480
+ text = processor.batch_decode(generated_ids, skip_special_tokens=True)
481
+ self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
docs/transformers/tests/models/chameleon/test_processor_chameleon.py ADDED
@@ -0,0 +1,76 @@
1
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Testing suite for the PyTorch chameleon model."""
15
+
16
+ import tempfile
17
+ import unittest
18
+
19
+ from transformers import ChameleonProcessor, LlamaTokenizer
20
+ from transformers.testing_utils import get_tests_dir
21
+ from transformers.utils import is_vision_available
22
+
23
+ from ...test_processing_common import ProcessorTesterMixin
24
+
25
+
26
+ if is_vision_available():
27
+ from transformers import ChameleonImageProcessor
28
+
29
+
30
+ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
31
+
32
+
33
+ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase):
34
+ processor_class = ChameleonProcessor
35
+
36
+ @classmethod
37
+ def setUpClass(cls):
38
+ cls.tmpdirname = tempfile.mkdtemp()
39
+ image_processor = ChameleonImageProcessor()
40
+ tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB)
41
+ tokenizer.pad_token_id = 0
42
+ tokenizer.sep_token_id = 1
43
+ tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
44
+ processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer, image_seq_length=2)
45
+ processor.save_pretrained(cls.tmpdirname)
46
+ cls.image_token = processor.image_token
47
+
48
+ def test_special_mm_token_truncation(self):
49
+ """Tests that special vision tokens do not get truncated when `truncation=True` is set."""
50
+
51
+ processor = self.get_processor()
52
+
53
+ input_str = self.prepare_text_inputs(batch_size=2, modality="image")
54
+ image_input = self.prepare_image_inputs(batch_size=2)
55
+
56
+ _ = processor(
57
+ text=input_str,
58
+ images=image_input,
59
+ return_tensors="pt",
60
+ truncation=None,
61
+ padding=True,
62
+ )
63
+
64
+ with self.assertRaises(ValueError):
65
+ _ = processor(
66
+ text=input_str,
67
+ images=image_input,
68
+ return_tensors="pt",
69
+ truncation=True,
70
+ padding=True,
71
+ max_length=20,
72
+ )
73
+
74
+ @staticmethod
75
+ def prepare_processor_dict():
76
+ return {"image_seq_length": 2} # fmt: skip
docs/transformers/tests/models/chinese_clip/__init__.py ADDED
File without changes
docs/transformers/tests/models/chinese_clip/test_image_processing_chinese_clip.py ADDED
@@ -0,0 +1,175 @@
1
+ # Copyright 2021 HuggingFace Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import unittest
17
+
18
+ from transformers.testing_utils import require_torch, require_vision
19
+ from transformers.utils import is_torchvision_available, is_vision_available
20
+
21
+ from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
22
+
23
+
24
+ if is_vision_available():
25
+ from transformers import ChineseCLIPImageProcessor
26
+
27
+ if is_torchvision_available():
28
+ from transformers import ChineseCLIPImageProcessorFast
29
+
30
+
31
+ class ChineseCLIPImageProcessingTester:
32
+ def __init__(
33
+ self,
34
+ parent,
35
+ batch_size=7,
36
+ num_channels=3,
37
+ image_size=18,
38
+ min_resolution=30,
39
+ max_resolution=400,
40
+ do_resize=True,
41
+ size=None,
42
+ do_center_crop=True,
43
+ crop_size=None,
44
+ do_normalize=True,
45
+ image_mean=[0.48145466, 0.4578275, 0.40821073],
46
+ image_std=[0.26862954, 0.26130258, 0.27577711],
47
+ do_convert_rgb=True,
48
+ ):
49
+ size = size if size is not None else {"height": 224, "width": 224}
50
+ crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
51
+ self.parent = parent
52
+ self.batch_size = batch_size
53
+ self.num_channels = num_channels
54
+ self.image_size = image_size
55
+ self.min_resolution = min_resolution
56
+ self.max_resolution = max_resolution
57
+ self.do_resize = do_resize
58
+ self.size = size
59
+ self.do_center_crop = do_center_crop
60
+ self.crop_size = crop_size
61
+ self.do_normalize = do_normalize
62
+ self.image_mean = image_mean
63
+ self.image_std = image_std
64
+ self.do_convert_rgb = do_convert_rgb
65
+
66
+ def prepare_image_processor_dict(self):
67
+ return {
68
+ "do_resize": self.do_resize,
69
+ "size": self.size,
70
+ "do_center_crop": self.do_center_crop,
71
+ "crop_size": self.crop_size,
72
+ "do_normalize": self.do_normalize,
73
+ "image_mean": self.image_mean,
74
+ "image_std": self.image_std,
75
+ "do_convert_rgb": self.do_convert_rgb,
76
+ }
77
+
78
+ def expected_output_image_shape(self, images):
79
+ return 3, self.crop_size["height"], self.crop_size["width"]
80
+
81
+ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
82
+ return prepare_image_inputs(
83
+ batch_size=self.batch_size,
84
+ num_channels=self.num_channels,
85
+ min_resolution=self.min_resolution,
86
+ max_resolution=self.max_resolution,
87
+ equal_resolution=equal_resolution,
88
+ numpify=numpify,
89
+ torchify=torchify,
90
+ )
91
+
92
+
93
+ @require_torch
94
+ @require_vision
95
+ class ChineseCLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
96
+ image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
97
+ fast_image_processing_class = ChineseCLIPImageProcessorFast if is_torchvision_available() else None
98
+
99
+ def setUp(self):
100
+ super().setUp()
101
+ self.image_processor_tester = ChineseCLIPImageProcessingTester(self, do_center_crop=True)
102
+
103
+ @property
104
+ def image_processor_dict(self):
105
+ return self.image_processor_tester.prepare_image_processor_dict()
106
+
107
+ def test_image_processor_properties(self):
108
+ for image_processing_class in self.image_processor_list:
109
+ image_processing = image_processing_class(**self.image_processor_dict)
110
+ self.assertTrue(hasattr(image_processing, "do_resize"))
111
+ self.assertTrue(hasattr(image_processing, "size"))
112
+ self.assertTrue(hasattr(image_processing, "do_center_crop"))
113
+ self.assertTrue(hasattr(image_processing, "center_crop"))
114
+ self.assertTrue(hasattr(image_processing, "do_normalize"))
115
+ self.assertTrue(hasattr(image_processing, "image_mean"))
116
+ self.assertTrue(hasattr(image_processing, "image_std"))
117
+ self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
118
+
119
+ def test_image_processor_from_dict_with_kwargs(self):
120
+ for image_processing_class in self.image_processor_list:
121
+ image_processor = image_processing_class.from_dict(self.image_processor_dict)
122
+ self.assertEqual(image_processor.size, {"height": 224, "width": 224})
123
+ self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
124
+
125
+ image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84)
126
+ self.assertEqual(image_processor.size, {"shortest_edge": 42})
127
+ self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
128
+
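The integer kwargs in the test above are normalized into size dicts by the image-processing utilities; a simplified sketch of that convention (not the exact helper):

    def normalize_size(size, default_to_square=True):
        # An int is read either as a square crop (height == width) or as the length of
        # the shortest edge, depending on the processor's resizing convention.
        if isinstance(size, int):
            return {"height": size, "width": size} if default_to_square else {"shortest_edge": size}
        return dict(size)

    print(normalize_size(42, default_to_square=False))  # {'shortest_edge': 42}
    print(normalize_size(84))                            # {'height': 84, 'width': 84}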
129
+ @unittest.skip(
130
+ reason="ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet"
131
+ ) # FIXME Amy
132
+ def test_call_numpy_4_channels(self):
133
+ pass
134
+
135
+
136
+ @require_torch
137
+ @require_vision
138
+ class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.TestCase):
139
+ image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
140
+ fast_image_processing_class = ChineseCLIPImageProcessorFast if is_torchvision_available() else None
141
+
142
+ def setUp(self):
143
+ super().setUp()
144
+ self.image_processor_tester = ChineseCLIPImageProcessingTester(self, num_channels=4, do_center_crop=True)
145
+ self.expected_encoded_image_num_channels = 3
146
+
147
+ @property
148
+ def image_processor_dict(self):
149
+ return self.image_processor_tester.prepare_image_processor_dict()
150
+
151
+ def test_image_processor_properties(self):
152
+ for image_processing_class in self.image_processor_list:
153
+ image_processing = image_processing_class(**self.image_processor_dict)
154
+ self.assertTrue(hasattr(image_processing, "do_resize"))
155
+ self.assertTrue(hasattr(image_processing, "size"))
156
+ self.assertTrue(hasattr(image_processing, "do_center_crop"))
157
+ self.assertTrue(hasattr(image_processing, "center_crop"))
158
+ self.assertTrue(hasattr(image_processing, "do_normalize"))
159
+ self.assertTrue(hasattr(image_processing, "image_mean"))
160
+ self.assertTrue(hasattr(image_processing, "image_std"))
161
+ self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
162
+
163
+ @unittest.skip(reason="ChineseCLIPImageProcessor does not support 4 channels yet") # FIXME Amy
164
+ def test_call_numpy(self):
165
+ return super().test_call_numpy()
166
+
167
+ @unittest.skip(reason="ChineseCLIPImageProcessor does not support 4 channels yet") # FIXME Amy
168
+ def test_call_pytorch(self):
169
+ return super().test_call_torch()
170
+
171
+ @unittest.skip(
172
+ reason="ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet"
173
+ ) # FIXME Amy
174
+ def test_call_numpy_4_channels(self):
175
+ pass
docs/transformers/tests/models/chinese_clip/test_modeling_chinese_clip.py ADDED
@@ -0,0 +1,762 @@
1
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Testing suite for the PyTorch Chinese-CLIP model."""
15
+
16
+ import inspect
17
+ import os
18
+ import tempfile
19
+ import unittest
20
+
21
+ import numpy as np
22
+ import requests
23
+
24
+ from transformers import ChineseCLIPConfig, ChineseCLIPTextConfig, ChineseCLIPVisionConfig
25
+ from transformers.models.auto import get_values
26
+ from transformers.testing_utils import require_torch, require_vision, slow, torch_device
27
+ from transformers.utils import is_torch_available, is_vision_available
28
+
29
+ from ...test_configuration_common import ConfigTester
30
+ from ...test_modeling_common import (
31
+ ModelTesterMixin,
32
+ _config_zero_init,
33
+ floats_tensor,
34
+ ids_tensor,
35
+ random_attention_mask,
36
+ )
37
+ from ...test_pipeline_mixin import PipelineTesterMixin
38
+
39
+
40
+ if is_torch_available():
41
+ import torch
42
+ from torch import nn
43
+
44
+ from transformers import (
45
+ MODEL_FOR_PRETRAINING_MAPPING,
46
+ ChineseCLIPModel,
47
+ ChineseCLIPTextModel,
48
+ ChineseCLIPVisionModel,
49
+ )
50
+
51
+
52
+ if is_vision_available():
53
+ from PIL import Image
54
+
55
+ from transformers import ChineseCLIPProcessor
56
+
57
+
58
+ class ChineseCLIPTextModelTester:
59
+ def __init__(
60
+ self,
61
+ parent,
62
+ batch_size=13,
63
+ seq_length=7,
64
+ is_training=True,
65
+ use_input_mask=True,
66
+ use_token_type_ids=True,
67
+ use_labels=True,
68
+ vocab_size=99,
69
+ hidden_size=32,
70
+ num_hidden_layers=2,
71
+ num_attention_heads=4,
72
+ intermediate_size=37,
73
+ hidden_act="gelu",
74
+ hidden_dropout_prob=0.1,
75
+ attention_probs_dropout_prob=0.1,
76
+ max_position_embeddings=512,
77
+ type_vocab_size=16,
78
+ type_sequence_label_size=2,
79
+ initializer_range=0.02,
80
+ num_labels=3,
81
+ num_choices=4,
82
+ scope=None,
83
+ ):
84
+ self.parent = parent
85
+ self.batch_size = batch_size
86
+ self.seq_length = seq_length
87
+ self.is_training = is_training
88
+ self.use_input_mask = use_input_mask
89
+ self.use_token_type_ids = use_token_type_ids
90
+ self.use_labels = use_labels
91
+ self.vocab_size = vocab_size
92
+ self.hidden_size = hidden_size
93
+ self.num_hidden_layers = num_hidden_layers
94
+ self.num_attention_heads = num_attention_heads
95
+ self.intermediate_size = intermediate_size
96
+ self.hidden_act = hidden_act
97
+ self.hidden_dropout_prob = hidden_dropout_prob
98
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
99
+ self.max_position_embeddings = max_position_embeddings
100
+ self.type_vocab_size = type_vocab_size
101
+ self.type_sequence_label_size = type_sequence_label_size
102
+ self.initializer_range = initializer_range
103
+ self.num_labels = num_labels
104
+ self.num_choices = num_choices
105
+ self.scope = scope
106
+
107
+ def prepare_config_and_inputs(self):
108
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
109
+
110
+ input_mask = None
111
+ if self.use_input_mask:
112
+ input_mask = random_attention_mask([self.batch_size, self.seq_length])
113
+
114
+ token_type_ids = None
115
+ if self.use_token_type_ids:
116
+ token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
117
+
118
+ sequence_labels = None
119
+ token_labels = None
120
+ choice_labels = None
121
+ if self.use_labels:
122
+ sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
123
+ token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
124
+ choice_labels = ids_tensor([self.batch_size], self.num_choices)
125
+
126
+ config = self.get_config()
127
+
128
+ return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
129
+
130
+ def get_config(self):
131
+ """
132
+ Returns a tiny configuration by default.
133
+ """
134
+ return ChineseCLIPTextConfig(
135
+ vocab_size=self.vocab_size,
136
+ hidden_size=self.hidden_size,
137
+ num_hidden_layers=self.num_hidden_layers,
138
+ num_attention_heads=self.num_attention_heads,
139
+ intermediate_size=self.intermediate_size,
140
+ hidden_act=self.hidden_act,
141
+ hidden_dropout_prob=self.hidden_dropout_prob,
142
+ attention_probs_dropout_prob=self.attention_probs_dropout_prob,
143
+ max_position_embeddings=self.max_position_embeddings,
144
+ type_vocab_size=self.type_vocab_size,
145
+ is_decoder=False,
146
+ initializer_range=self.initializer_range,
147
+ )
148
+
149
+ def prepare_config_and_inputs_for_decoder(self):
150
+ (
151
+ config,
152
+ input_ids,
153
+ token_type_ids,
154
+ input_mask,
155
+ sequence_labels,
156
+ token_labels,
157
+ choice_labels,
158
+ ) = self.prepare_config_and_inputs()
159
+
160
+ config.is_decoder = True
161
+ encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
162
+ encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
163
+
164
+ return (
165
+ config,
166
+ input_ids,
167
+ token_type_ids,
168
+ input_mask,
169
+ sequence_labels,
170
+ token_labels,
171
+ choice_labels,
172
+ encoder_hidden_states,
173
+ encoder_attention_mask,
174
+ )
175
+
176
+ def create_and_check_model(
177
+ self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
178
+ ):
179
+ model = ChineseCLIPTextModel(config=config)
180
+ model.to(torch_device)
181
+ model.eval()
182
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
183
+ result = model(input_ids, token_type_ids=token_type_ids)
184
+ result = model(input_ids)
185
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
186
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
187
+
188
+ def create_and_check_model_as_decoder(
189
+ self,
190
+ config,
191
+ input_ids,
192
+ token_type_ids,
193
+ input_mask,
194
+ sequence_labels,
195
+ token_labels,
196
+ choice_labels,
197
+ encoder_hidden_states,
198
+ encoder_attention_mask,
199
+ ):
200
+ config.add_cross_attention = True
201
+ model = ChineseCLIPTextModel(config)
202
+ model.to(torch_device)
203
+ model.eval()
204
+ result = model(
205
+ input_ids,
206
+ attention_mask=input_mask,
207
+ token_type_ids=token_type_ids,
208
+ encoder_hidden_states=encoder_hidden_states,
209
+ encoder_attention_mask=encoder_attention_mask,
210
+ )
211
+ result = model(
212
+ input_ids,
213
+ attention_mask=input_mask,
214
+ token_type_ids=token_type_ids,
215
+ encoder_hidden_states=encoder_hidden_states,
216
+ )
217
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
218
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
219
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
220
+
221
+ def prepare_config_and_inputs_for_common(self):
222
+ config_and_inputs = self.prepare_config_and_inputs()
223
+ (
224
+ config,
225
+ input_ids,
226
+ token_type_ids,
227
+ input_mask,
228
+ sequence_labels,
229
+ token_labels,
230
+ choice_labels,
231
+ ) = config_and_inputs
232
+ inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
233
+ return config, inputs_dict
234
+
235
+
236
+ class ChineseCLIPVisionModelTester:
237
+ def __init__(
238
+ self,
239
+ parent,
240
+ batch_size=12,
241
+ image_size=30,
242
+ patch_size=2,
243
+ num_channels=3,
244
+ is_training=True,
245
+ hidden_size=32,
246
+ projection_dim=32,
247
+ num_hidden_layers=2,
248
+ num_attention_heads=4,
249
+ intermediate_size=37,
250
+ dropout=0.1,
251
+ attention_dropout=0.1,
252
+ initializer_range=0.02,
253
+ scope=None,
254
+ ):
255
+ self.parent = parent
256
+ self.batch_size = batch_size
257
+ self.image_size = image_size
258
+ self.patch_size = patch_size
259
+ self.num_channels = num_channels
260
+ self.is_training = is_training
261
+ self.hidden_size = hidden_size
262
+ self.projection_dim = projection_dim
263
+ self.num_hidden_layers = num_hidden_layers
264
+ self.num_attention_heads = num_attention_heads
265
+ self.intermediate_size = intermediate_size
266
+ self.dropout = dropout
267
+ self.attention_dropout = attention_dropout
268
+ self.initializer_range = initializer_range
269
+ self.scope = scope
270
+
271
+ # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
272
+ num_patches = (image_size // patch_size) ** 2
273
+ self.seq_length = num_patches + 1
274
+
275
+ def prepare_config_and_inputs(self):
276
+ pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
277
+ config = self.get_config()
278
+
279
+ return config, pixel_values
280
+
281
+ def get_config(self):
282
+ return ChineseCLIPVisionConfig(
283
+ image_size=self.image_size,
284
+ patch_size=self.patch_size,
285
+ num_channels=self.num_channels,
286
+ hidden_size=self.hidden_size,
287
+ projection_dim=self.projection_dim,
288
+ num_hidden_layers=self.num_hidden_layers,
289
+ num_attention_heads=self.num_attention_heads,
290
+ intermediate_size=self.intermediate_size,
291
+ dropout=self.dropout,
292
+ attention_dropout=self.attention_dropout,
293
+ initializer_range=self.initializer_range,
294
+ )
295
+
296
+ def create_and_check_model(self, config, pixel_values):
297
+ model = ChineseCLIPVisionModel(config=config)
298
+ model.to(torch_device)
299
+ model.eval()
300
+ with torch.no_grad():
301
+ result = model(pixel_values)
302
+ # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
303
+ image_size = (self.image_size, self.image_size)
304
+ patch_size = (self.patch_size, self.patch_size)
305
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
306
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
307
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
308
+
309
+ def prepare_config_and_inputs_for_common(self):
310
+ config_and_inputs = self.prepare_config_and_inputs()
311
+ config, pixel_values = config_and_inputs
312
+ inputs_dict = {"pixel_values": pixel_values}
313
+ return config, inputs_dict
314
+
315
+
316
+ @require_torch
317
+ class ChineseCLIPTextModelTest(ModelTesterMixin, unittest.TestCase):
318
+ all_model_classes = (ChineseCLIPTextModel,) if is_torch_available() else ()
319
+ fx_compatible = False
320
+
321
+ # special case for ForPreTraining model
322
+ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
323
+ inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
324
+
325
+ if return_labels:
326
+ if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
327
+ inputs_dict["labels"] = torch.zeros(
328
+ (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
329
+ )
330
+ inputs_dict["next_sentence_label"] = torch.zeros(
331
+ self.model_tester.batch_size, dtype=torch.long, device=torch_device
332
+ )
333
+ return inputs_dict
334
+
335
+ def setUp(self):
336
+ self.model_tester = ChineseCLIPTextModelTester(self)
337
+ self.config_tester = ConfigTester(self, config_class=ChineseCLIPTextConfig, hidden_size=37)
338
+
339
+ def test_config(self):
340
+ self.config_tester.run_common_tests()
341
+
342
+ def test_model(self):
343
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
344
+ self.model_tester.create_and_check_model(*config_and_inputs)
345
+
346
+ def test_model_various_embeddings(self):
347
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
348
+ for type in ["absolute", "relative_key", "relative_key_query"]:
349
+ config_and_inputs[0].position_embedding_type = type
350
+ self.model_tester.create_and_check_model(*config_and_inputs)
351
+
352
+ def test_model_as_decoder(self):
353
+ config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
354
+ self.model_tester.create_and_check_model_as_decoder(*config_and_inputs)
355
+
356
+ def test_model_as_decoder_with_default_input_mask(self):
357
+ (
358
+ config,
359
+ input_ids,
360
+ token_type_ids,
361
+ input_mask,
362
+ sequence_labels,
363
+ token_labels,
364
+ choice_labels,
365
+ encoder_hidden_states,
366
+ encoder_attention_mask,
367
+ ) = self.model_tester.prepare_config_and_inputs_for_decoder()
368
+
369
+ input_mask = None
370
+
371
+ self.model_tester.create_and_check_model_as_decoder(
372
+ config,
373
+ input_ids,
374
+ token_type_ids,
375
+ input_mask,
376
+ sequence_labels,
377
+ token_labels,
378
+ choice_labels,
379
+ encoder_hidden_states,
380
+ encoder_attention_mask,
381
+ )
382
+
383
+ @slow
384
+ def test_model_from_pretrained(self):
385
+ model_name = "OFA-Sys/chinese-clip-vit-base-patch16"
386
+ model = ChineseCLIPTextModel.from_pretrained(model_name)
387
+ self.assertIsNotNone(model)
388
+
389
+ @unittest.skip
390
+ def test_training(self):
391
+ pass
392
+
393
+ @unittest.skip
394
+ def test_training_gradient_checkpointing(self):
395
+ pass
396
+
397
+ @unittest.skip(
398
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
399
+ )
400
+ def test_training_gradient_checkpointing_use_reentrant(self):
401
+ pass
402
+
403
+ @unittest.skip(
404
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
405
+ )
406
+ def test_training_gradient_checkpointing_use_reentrant_false(self):
407
+ pass
408
+
409
+
410
+ @require_torch
411
+ class ChineseCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase):
412
+ """
413
+ Here we also overwrite some of the tests of test_modeling_common.py, as CHINESE_CLIP does not use input_ids, inputs_embeds,
414
+ attention_mask and seq_length.
415
+ """
416
+
417
+ all_model_classes = (ChineseCLIPVisionModel,) if is_torch_available() else ()
418
+ fx_compatible = False
419
+ test_pruning = False
420
+ test_resize_embeddings = False
421
+ test_head_masking = False
422
+
423
+ def setUp(self):
424
+ self.model_tester = ChineseCLIPVisionModelTester(self)
425
+ self.config_tester = ConfigTester(
426
+ self, config_class=ChineseCLIPVisionConfig, has_text_modality=False, hidden_size=37
427
+ )
428
+
429
+ def test_config(self):
430
+ self.config_tester.run_common_tests()
431
+
432
+ @unittest.skip(reason="CHINESE_CLIP does not use inputs_embeds")
433
+ def test_inputs_embeds(self):
434
+ pass
435
+
436
+ def test_model_get_set_embeddings(self):
437
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
438
+
439
+ for model_class in self.all_model_classes:
440
+ model = model_class(config)
441
+ self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
442
+ x = model.get_output_embeddings()
443
+ self.assertTrue(x is None or isinstance(x, nn.Linear))
444
+
445
+ def test_forward_signature(self):
446
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
447
+
448
+ for model_class in self.all_model_classes:
449
+ model = model_class(config)
450
+ signature = inspect.signature(model.forward)
451
+ # signature.parameters is an OrderedDict => so arg_names order is deterministic
452
+ arg_names = [*signature.parameters.keys()]
453
+
454
+ expected_arg_names = ["pixel_values"]
455
+ self.assertListEqual(arg_names[:1], expected_arg_names)
456
+
457
+ def test_model(self):
458
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
459
+ self.model_tester.create_and_check_model(*config_and_inputs)
460
+
461
+ @unittest.skip
462
+ def test_training(self):
463
+ pass
464
+
465
+ @unittest.skip
466
+ def test_training_gradient_checkpointing(self):
467
+ pass
468
+
469
+ @unittest.skip(
470
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
471
+ )
472
+ def test_training_gradient_checkpointing_use_reentrant(self):
473
+ pass
474
+
475
+ @unittest.skip(
476
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
477
+ )
478
+ def test_training_gradient_checkpointing_use_reentrant_false(self):
479
+ pass
480
+
481
+ @slow
482
+ def test_model_from_pretrained(self):
483
+ model_name = "OFA-Sys/chinese-clip-vit-base-patch16"
484
+ model = ChineseCLIPVisionModel.from_pretrained(model_name)
485
+ self.assertIsNotNone(model)
486
+
487
+
488
+ class ChineseCLIPModelTester:
489
+ def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
490
+ if text_kwargs is None:
491
+ text_kwargs = {}
492
+ if vision_kwargs is None:
493
+ vision_kwargs = {}
494
+
495
+ self.parent = parent
496
+ self.text_model_tester = ChineseCLIPTextModelTester(parent, **text_kwargs)
497
+ self.vision_model_tester = ChineseCLIPVisionModelTester(parent, **vision_kwargs)
498
+ self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
499
+ self.is_training = is_training
500
+
501
+ def prepare_config_and_inputs(self):
502
+ (
503
+ config,
504
+ input_ids,
505
+ token_type_ids,
506
+ attention_mask,
507
+ _,
508
+ __,
509
+ ___,
510
+ ) = self.text_model_tester.prepare_config_and_inputs()
511
+ vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
512
+
513
+ config = self.get_config()
514
+
515
+ return config, input_ids, token_type_ids, attention_mask, pixel_values
516
+
517
+ def get_config(self):
518
+ return ChineseCLIPConfig.from_text_vision_configs(
519
+ self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
520
+ )
521
+
522
+ def create_and_check_model(self, config, input_ids, token_type_ids, attention_mask, pixel_values):
523
+ model = ChineseCLIPModel(config).to(torch_device).eval()
524
+ with torch.no_grad():
525
+ result = model(input_ids, pixel_values, attention_mask, token_type_ids)
526
+ self.parent.assertEqual(
527
+ result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
528
+ )
529
+ self.parent.assertEqual(
530
+ result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
531
+ )
532
+
533
+ def prepare_config_and_inputs_for_common(self):
534
+ config_and_inputs = self.prepare_config_and_inputs()
535
+ config, input_ids, token_type_ids, attention_mask, pixel_values = config_and_inputs
536
+ inputs_dict = {
537
+ "input_ids": input_ids,
538
+ "token_type_ids": token_type_ids,
539
+ "attention_mask": attention_mask,
540
+ "pixel_values": pixel_values,
541
+ "return_loss": True,
542
+ }
543
+ return config, inputs_dict
544
+
545
+
546
+ @require_torch
547
+ class ChineseCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
548
+ all_model_classes = (ChineseCLIPModel,) if is_torch_available() else ()
549
+ pipeline_model_mapping = {"feature-extraction": ChineseCLIPModel} if is_torch_available() else {}
550
+ fx_compatible = False
551
+ test_head_masking = False
552
+ test_pruning = False
553
+ test_resize_embeddings = False
554
+ test_attention_outputs = False
555
+
556
+ def setUp(self):
557
+ text_kwargs = {"use_labels": False, "batch_size": 12}
558
+ vision_kwargs = {"batch_size": 12}
559
+ self.model_tester = ChineseCLIPModelTester(self, text_kwargs, vision_kwargs)
560
+
561
+ def test_model(self):
562
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
563
+ self.model_tester.create_and_check_model(*config_and_inputs)
564
+
565
+ @unittest.skip(reason="Hidden_states is tested in individual model tests")
566
+ def test_hidden_states_output(self):
567
+ pass
568
+
569
+ @unittest.skip(reason="Inputs_embeds is tested in individual model tests")
570
+ def test_inputs_embeds(self):
571
+ pass
572
+
573
+ @unittest.skip(reason="Retain_grad is tested in individual model tests")
574
+ def test_retain_grad_hidden_states_attentions(self):
575
+ pass
576
+
577
+ @unittest.skip(reason="ChineseCLIPModel does not have input/output embeddings")
578
+ def test_model_get_set_embeddings(self):
579
+ pass
580
+
581
+ # override as the `logit_scale` parameter initialization is different for CHINESE_CLIP
582
+ def test_initialization(self):
583
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
584
+
585
+ configs_no_init = _config_zero_init(config)
586
+ for sub_config_key in ("vision_config", "text_config"):
587
+ sub_config = getattr(configs_no_init, sub_config_key, {})
588
+ setattr(configs_no_init, sub_config_key, _config_zero_init(sub_config))
589
+ for model_class in self.all_model_classes:
590
+ model = model_class(config=configs_no_init)
591
+ for name, param in model.named_parameters():
592
+ if param.requires_grad:
593
+ # check if `logit_scale` is initialized as per the original implementation
594
+ if name == "logit_scale":
595
+ self.assertAlmostEqual(
596
+ param.data.item(),
597
+ np.log(1 / 0.07),
598
+ delta=1e-3,
599
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
600
+ )
601
+ else:
602
+ self.assertIn(
603
+ ((param.data.mean() * 1e9).round() / 1e9).item(),
604
+ [0.0, 1.0],
605
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
606
+ )
607
+
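The expected `logit_scale` value checked above is the CLIP-style temperature initialisation, log(1 / 0.07):

    import numpy as np

    # exp(logit_scale) == 1 / 0.07, i.e. an initial temperature of 0.07 for the
    # image-text similarity logits.
    print(np.log(1 / 0.07))  # ~2.6593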
608
+ def _create_and_check_torchscript(self, config, inputs_dict):
609
+ if not self.test_torchscript:
610
+ self.skipTest(reason="test_torchscript is set to False")
611
+
612
+ configs_no_init = _config_zero_init(config) # To be sure we have no Nan
613
+ configs_no_init.torchscript = True
614
+ configs_no_init.return_dict = False
615
+ for model_class in self.all_model_classes:
616
+ model = model_class(config=configs_no_init)
617
+ model.to(torch_device)
618
+ model.eval()
619
+
620
+ try:
621
+ input_ids = inputs_dict["input_ids"]
622
+ pixel_values = inputs_dict["pixel_values"] # CHINESE_CLIP needs pixel_values
623
+ traced_model = torch.jit.trace(model, (input_ids, pixel_values))
624
+ except RuntimeError:
625
+ self.fail("Couldn't trace module.")
626
+
627
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
628
+ pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
629
+
630
+ try:
631
+ torch.jit.save(traced_model, pt_file_name)
632
+ except Exception:
633
+ self.fail("Couldn't save module.")
634
+
635
+ try:
636
+ loaded_model = torch.jit.load(pt_file_name)
637
+ except Exception:
638
+ self.fail("Couldn't load module.")
639
+
640
+ model.to(torch_device)
641
+ model.eval()
642
+
643
+ loaded_model.to(torch_device)
644
+ loaded_model.eval()
645
+
646
+ model_state_dict = model.state_dict()
647
+ loaded_model_state_dict = loaded_model.state_dict()
648
+
649
+ non_persistent_buffers = {}
650
+ for key in loaded_model_state_dict.keys():
651
+ if key not in model_state_dict.keys():
652
+ non_persistent_buffers[key] = loaded_model_state_dict[key]
653
+
654
+ loaded_model_state_dict = {
655
+ key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
656
+ }
657
+
658
+ self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
659
+
660
+ model_buffers = list(model.buffers())
661
+ for non_persistent_buffer in non_persistent_buffers.values():
662
+ found_buffer = False
663
+ for i, model_buffer in enumerate(model_buffers):
664
+ if torch.equal(non_persistent_buffer, model_buffer):
665
+ found_buffer = True
666
+ break
667
+
668
+ self.assertTrue(found_buffer)
669
+ model_buffers.pop(i)
670
+
671
+ models_equal = True
672
+ for layer_name, p1 in model_state_dict.items():
673
+ p2 = loaded_model_state_dict[layer_name]
674
+ if p1.data.ne(p2.data).sum() > 0:
675
+ models_equal = False
676
+
677
+ self.assertTrue(models_equal)
678
+
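For context, a minimal sketch of the trace/save/load round trip the helper above performs, using a toy module (illustrative only):

    import os
    import tempfile

    import torch
    from torch import nn

    model = nn.Linear(4, 2).eval()
    example = torch.randn(1, 4)

    traced = torch.jit.trace(model, (example,))   # trace with example inputs
    with tempfile.TemporaryDirectory() as tmp:
        path = os.path.join(tmp, "traced_model.pt")
        torch.jit.save(traced, path)              # serialize the TorchScript module
        loaded = torch.jit.load(path)             # reload it

    # parameters should round-trip exactly
    for name, param in model.state_dict().items():
        assert torch.equal(param, loaded.state_dict()[name])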
679
+ @slow
680
+ def test_model_from_pretrained(self):
681
+ model_name = "OFA-Sys/chinese-clip-vit-base-patch16"
682
+ model = ChineseCLIPModel.from_pretrained(model_name)
683
+ self.assertIsNotNone(model)
684
+
685
+
686
+ # We will verify our results on an image of Pikachu
687
+ def prepare_img():
688
+ url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg"
689
+ im = Image.open(requests.get(url, stream=True).raw)
690
+ return im
691
+
692
+
693
+ @require_vision
694
+ @require_torch
695
+ class ChineseCLIPModelIntegrationTest(unittest.TestCase):
696
+ @slow
697
+ def test_inference(self):
698
+ model_name = "OFA-Sys/chinese-clip-vit-base-patch16"
699
+ model = ChineseCLIPModel.from_pretrained(model_name).to(torch_device)
700
+ processor = ChineseCLIPProcessor.from_pretrained(model_name)
701
+
702
+ image = prepare_img()
703
+ inputs = processor(
704
+ text=["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], images=image, padding=True, return_tensors="pt"
705
+ ).to(torch_device)
706
+
707
+ # forward pass
708
+ with torch.no_grad():
709
+ outputs = model(**inputs)
710
+
711
+ # verify the logits
712
+ self.assertEqual(
713
+ outputs.logits_per_image.shape,
714
+ torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
715
+ )
716
+ self.assertEqual(
717
+ outputs.logits_per_text.shape,
718
+ torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
719
+ )
720
+
721
+ probs = outputs.logits_per_image.softmax(dim=1)
722
+ expected_probs = torch.tensor([[1.2686e-03, 5.4499e-02, 6.7968e-04, 9.4355e-01]], device=torch_device)
723
+
724
+ torch.testing.assert_close(probs, expected_probs, rtol=5e-3, atol=5e-3)
725
+
726
+ @slow
727
+ def test_inference_interpolate_pos_encoding(self):
728
+ # ViT models have an `interpolate_pos_encoding` argument in their forward method,
729
+ # allowing interpolation of the pre-trained position embeddings in order to use
730
+ # the model on higher resolutions. The DINO model by Facebook AI leverages this
731
+ # to visualize self-attention on higher resolution images.
732
+ model_name = "OFA-Sys/chinese-clip-vit-base-patch16"
733
+ model = ChineseCLIPModel.from_pretrained(model_name).to(torch_device)
734
+
735
+ image_processor = ChineseCLIPProcessor.from_pretrained(
736
+ model_name, size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180}
737
+ )
738
+
739
+ image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
740
+ inputs = image_processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
741
+
742
+ # interpolate_pos_encoding=False should raise a ValueError
743
+ with self.assertRaises(ValueError, msg="doesn't match model"):
744
+ with torch.no_grad():
745
+ model(**inputs, interpolate_pos_encoding=False)
746
+
747
+ # forward pass
748
+ with torch.no_grad():
749
+ outputs = model(**inputs, interpolate_pos_encoding=True)
750
+
751
+ # verify the logits
752
+ expected_shape = torch.Size((1, 122, 768))
753
+
754
+ self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
755
+
756
+ expected_slice = torch.tensor(
757
+ [[-0.3990, 0.2983, -0.1239], [-0.1452, -0.2759, 0.0403], [-0.3149, -0.4763, 0.8555]]
758
+ ).to(torch_device)
759
+
760
+ torch.testing.assert_close(
761
+ outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4
762
+ )
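The expected sequence length of 122 asserted above follows from the patch arithmetic, assuming the 16x16 patches implied by the checkpoint name: with a 180x180 input, (180 // 16) ** 2 + 1 = 11 ** 2 + 1 = 122 (the extra token is the [CLS] token). A quick check:

    image_size, patch_size = 180, 16
    num_patches = (image_size // patch_size) ** 2  # 11 * 11 = 121 patches
    seq_length = num_patches + 1                   # + [CLS] token
    print(seq_length)  # 122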
docs/transformers/tests/models/chinese_clip/test_processor_chinese_clip.py ADDED
@@ -0,0 +1,217 @@
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ import os
17
+ import shutil
18
+ import tempfile
19
+ import unittest
20
+
21
+ import pytest
22
+
23
+ from transformers import BertTokenizer, BertTokenizerFast
24
+ from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES
25
+ from transformers.testing_utils import require_vision
26
+ from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
27
+
28
+ from ...test_processing_common import ProcessorTesterMixin
29
+
30
+
31
+ if is_vision_available():
32
+ from transformers import ChineseCLIPImageProcessor, ChineseCLIPProcessor
33
+
34
+
35
+ @require_vision
36
+ class ChineseCLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase):
37
+ processor_class = ChineseCLIPProcessor
38
+
39
+ @classmethod
40
+ def setUpClass(cls):
41
+ cls.tmpdirname = tempfile.mkdtemp()
42
+
43
+ vocab_tokens = [
44
+ "[UNK]",
45
+ "[CLS]",
46
+ "[SEP]",
47
+ "[PAD]",
48
+ "[MASK]",
49
+ "的",
50
+ "价",
51
+ "格",
52
+ "是",
53
+ "15",
54
+ "便",
55
+ "alex",
56
+ "##andra",
57
+ ",",
58
+ "。",
59
+ "-",
60
+ "t",
61
+ "shirt",
62
+ ]
63
+ cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
64
+ with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
65
+ vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
66
+
67
+ image_processor_map = {
68
+ "do_resize": True,
69
+ "size": {"height": 224, "width": 224},
70
+ "do_center_crop": True,
71
+ "crop_size": {"height": 18, "width": 18},
72
+ "do_normalize": True,
73
+ "image_mean": [0.48145466, 0.4578275, 0.40821073],
74
+ "image_std": [0.26862954, 0.26130258, 0.27577711],
75
+ "do_convert_rgb": True,
76
+ }
77
+ cls.image_processor_file = os.path.join(cls.tmpdirname, FEATURE_EXTRACTOR_NAME)
78
+ with open(cls.image_processor_file, "w", encoding="utf-8") as fp:
79
+ json.dump(image_processor_map, fp)
80
+
81
+ tokenizer = cls.get_tokenizer()
82
+ image_processor = cls.get_image_processor()
83
+ processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
84
+ processor.save_pretrained(cls.tmpdirname)
85
+
86
+ @classmethod
87
+ def get_tokenizer(cls, **kwargs):
88
+ return BertTokenizer.from_pretrained(cls.tmpdirname, **kwargs)
89
+
90
+ @classmethod
91
+ def get_rust_tokenizer(cls, **kwargs):
92
+ return BertTokenizerFast.from_pretrained(cls.tmpdirname, **kwargs)
93
+
94
+ @classmethod
95
+ def get_image_processor(cls, **kwargs):
96
+ return ChineseCLIPImageProcessor.from_pretrained(cls.tmpdirname, **kwargs)
97
+
98
+ @classmethod
99
+ def tearDownClass(cls):
100
+ shutil.rmtree(cls.tmpdirname, ignore_errors=True)
101
+
102
+ def test_save_load_pretrained_default(self):
103
+ tokenizer_slow = self.get_tokenizer()
104
+ tokenizer_fast = self.get_rust_tokenizer()
105
+ image_processor = self.get_image_processor()
106
+
107
+ with tempfile.TemporaryDirectory() as tmpdir:
108
+ processor_slow = ChineseCLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
109
+ processor_slow.save_pretrained(tmpdir)
110
+ processor_slow = ChineseCLIPProcessor.from_pretrained(self.tmpdirname, use_fast=False)
111
+
112
+ processor_fast = ChineseCLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
113
+ processor_fast.save_pretrained(tmpdir)
114
+ processor_fast = ChineseCLIPProcessor.from_pretrained(self.tmpdirname)
115
+
116
+ self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab())
+ self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab())
+ self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab())
+ self.assertIsInstance(processor_slow.tokenizer, BertTokenizer)
+ self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast)
+
+ self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
+ self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
+ self.assertIsInstance(processor_slow.image_processor, ChineseCLIPImageProcessor)
+ self.assertIsInstance(processor_fast.image_processor, ChineseCLIPImageProcessor)
+
+ def test_save_load_pretrained_additional_features(self):
+ with tempfile.TemporaryDirectory() as tmpdir:
+ processor = ChineseCLIPProcessor(
+ tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()
+ )
+ processor.save_pretrained(tmpdir)
+
+ tokenizer_add_kwargs = self.get_tokenizer(cls_token="(CLS)", sep_token="(SEP)")
+ image_processor_add_kwargs = self.get_image_processor(do_normalize=False)
+
+ processor = ChineseCLIPProcessor.from_pretrained(
+ tmpdir, cls_token="(CLS)", sep_token="(SEP)", do_normalize=False
+ )
+
+ self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
+ self.assertIsInstance(processor.tokenizer, BertTokenizerFast)
+
+ self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
+ self.assertIsInstance(processor.image_processor, ChineseCLIPImageProcessor)
+
+ def test_image_processor(self):
+ image_processor = self.get_image_processor()
+ tokenizer = self.get_tokenizer()
+
+ processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
+
+ image_input = self.prepare_image_inputs()
+
+ input_feat_extract = image_processor(image_input, return_tensors="np")
+ input_processor = processor(images=image_input, return_tensors="np")
+
+ for key in input_feat_extract.keys():
+ self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
+
+ def test_tokenizer(self):
+ image_processor = self.get_image_processor()
+ tokenizer = self.get_tokenizer()
+
+ processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
+
+ input_str = "Alexandra,T-shirt的价格是15便士。"
+
+ encoded_processor = processor(text=input_str)
+
+ encoded_tok = tokenizer(input_str)
+
+ for key in encoded_tok.keys():
+ self.assertListEqual(encoded_tok[key], encoded_processor[key])
+
+ def test_processor(self):
+ image_processor = self.get_image_processor()
+ tokenizer = self.get_tokenizer()
+
+ processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
+
+ input_str = "Alexandra,T-shirt的价格是15便士。"
+ image_input = self.prepare_image_inputs()
+
+ inputs = processor(text=input_str, images=image_input)
+
+ self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"])
+
+ # test if it raises when no input is passed
+ with pytest.raises(ValueError):
+ processor()
+
+ def test_tokenizer_decode(self):
+ image_processor = self.get_image_processor()
+ tokenizer = self.get_tokenizer()
+
+ processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
+
+ predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
+
+ decoded_processor = processor.batch_decode(predicted_ids)
+ decoded_tok = tokenizer.batch_decode(predicted_ids)
+
+ self.assertListEqual(decoded_tok, decoded_processor)
+
+ def test_model_input_names(self):
+ image_processor = self.get_image_processor()
+ tokenizer = self.get_tokenizer()
+
+ processor = ChineseCLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
+
+ input_str = "Alexandra,T-shirt的价格是15便士。"
+ image_input = self.prepare_image_inputs()
+
+ inputs = processor(text=input_str, images=image_input)
+
+ self.assertListEqual(list(inputs.keys()), processor.model_input_names)
docs/transformers/tests/models/clap/__init__.py ADDED
File without changes
docs/transformers/tests/models/clap/test_feature_extraction_clap.py ADDED
@@ -0,0 +1,546 @@
1
+ # Copyright 2023 HuggingFace Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import itertools
17
+ import random
18
+ import unittest
19
+
20
+ import numpy as np
21
+ from datasets import load_dataset
22
+
23
+ from transformers import ClapFeatureExtractor
24
+ from transformers.testing_utils import require_torch, require_torchaudio
25
+ from transformers.trainer_utils import set_seed
26
+ from transformers.utils.import_utils import is_torch_available
27
+
28
+ from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
29
+
30
+
31
+ if is_torch_available():
32
+ import torch
33
+
34
+ global_rng = random.Random()
35
+
36
+
37
+ # Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list
38
+ def floats_list(shape, scale=1.0, rng=None, name=None):
39
+ """Creates a random float32 tensor"""
40
+ if rng is None:
41
+ rng = global_rng
42
+
43
+ values = []
44
+ for batch_idx in range(shape[0]):
45
+ values.append([])
46
+ for _ in range(shape[1]):
47
+ values[-1].append(rng.random() * scale)
48
+
49
+ return values
50
+
51
+
52
+ @require_torch
53
+ @require_torchaudio
54
+ # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester with Whisper->Clap
55
+ class ClapFeatureExtractionTester:
56
+ def __init__(
57
+ self,
58
+ parent,
59
+ batch_size=7,
60
+ min_seq_length=400,
61
+ max_seq_length=2000,
62
+ feature_size=10,
63
+ hop_length=160,
64
+ chunk_length=8,
65
+ padding_value=0.0,
66
+ sampling_rate=4_000,
67
+ return_attention_mask=False,
68
+ do_normalize=True,
69
+ ):
70
+ self.parent = parent
71
+ self.batch_size = batch_size
72
+ self.min_seq_length = min_seq_length
73
+ self.max_seq_length = max_seq_length
74
+ self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
75
+ self.padding_value = padding_value
76
+ self.sampling_rate = sampling_rate
77
+ self.return_attention_mask = return_attention_mask
78
+ self.do_normalize = do_normalize
79
+ self.feature_size = feature_size
80
+ self.chunk_length = chunk_length
81
+ self.hop_length = hop_length
82
+
83
+ def prepare_feat_extract_dict(self):
84
+ return {
85
+ "feature_size": self.feature_size,
86
+ "hop_length": self.hop_length,
87
+ "chunk_length": self.chunk_length,
88
+ "padding_value": self.padding_value,
89
+ "sampling_rate": self.sampling_rate,
90
+ "return_attention_mask": self.return_attention_mask,
91
+ "do_normalize": self.do_normalize,
92
+ }
93
+
94
+ def prepare_inputs_for_common(self, equal_length=False, numpify=False):
95
+ def _flatten(list_of_lists):
96
+ return list(itertools.chain(*list_of_lists))
97
+
98
+ if equal_length:
99
+ speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
100
+ else:
101
+ # make sure that inputs increase in size
102
+ speech_inputs = [
103
+ floats_list((x, self.feature_size))
104
+ for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
105
+ ]
106
+ if numpify:
107
+ speech_inputs = [np.asarray(x) for x in speech_inputs]
108
+ return speech_inputs
109
+
110
+
111
+ @require_torch
112
+ @require_torchaudio
113
+ class ClapFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
114
+ feature_extraction_class = ClapFeatureExtractor
115
+
116
+ # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.setUp with Whisper->Clap
117
+ def setUp(self):
118
+ self.feat_extract_tester = ClapFeatureExtractionTester(self)
119
+
120
+ def test_call(self):
121
+ # Tests that all call wrap to encode_plus and batch_encode_plus
122
+ feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
123
+ # create three inputs of length 800, 1000, and 1200
124
+ speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
125
+ np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
126
+
127
+ # Test feature size
128
+ input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features
129
+ self.assertTrue(input_features.ndim == 4)
130
+
131
+ # Test not batched input
132
+ encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
133
+ encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
134
+ self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
135
+
136
+ # Test batched
137
+ encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
138
+ encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
139
+ for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
140
+ self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
141
+
142
+ # Test 2-D numpy arrays are batched.
143
+ speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
144
+ np_speech_inputs = np.asarray(speech_inputs)
145
+ encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
146
+ encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
147
+ for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
148
+ self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
149
+
150
+ # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_double_precision_pad
151
+ def test_double_precision_pad(self):
152
+ import torch
153
+
154
+ feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
155
+ np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
156
+ py_speech_inputs = np_speech_inputs.tolist()
157
+
158
+ for inputs in [py_speech_inputs, np_speech_inputs]:
159
+ np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
160
+ self.assertTrue(np_processed.input_features.dtype == np.float32)
161
+ pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
162
+ self.assertTrue(pt_processed.input_features.dtype == torch.float32)
163
+
164
+ # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest._load_datasamples
165
+ def _load_datasamples(self, num_samples):
166
+ ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
167
+ # automatic decoding with librispeech
168
+ speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
169
+
170
+ return [x["array"] for x in speech_samples]
171
+
172
+ def test_integration_fusion_short_input(self):
173
+ # fmt: off
174
+ EXPECTED_INPUT_FEATURES = torch.tensor(
175
+ [
176
+ [
177
+ # "repeat"
178
+ [
179
+ -20.1049, -19.9764, -20.0731, -19.5055, -27.5018, -22.5761, -26.6071,
180
+ -29.0091, -26.4659, -26.4236, -28.8808, -31.9190, -32.4848, -34.1186,
181
+ -34.0340, -32.8803, -30.9895, -37.6238, -38.0347, -40.6263, -36.3496,
182
+ -42.2533, -32.9132, -27.7068, -29.3704, -30.3208, -22.5972, -27.1494,
183
+ -30.1975, -31.1005, -29.9372, -27.1917, -25.9806, -30.3489, -33.2380,
184
+ -31.9062, -36.5498, -32.8721, -30.5629, -27.4674, -22.2232, -22.5653,
185
+ -16.3868, -17.2713, -25.9738, -30.6256, -34.3766, -31.1292, -27.8950,
186
+ -27.0588, -25.6206, -23.0712, -26.6050, -28.0112, -32.6847, -34.3396,
187
+ -34.9738, -35.8463, -39.2324, -37.1188, -33.3705, -28.9230, -28.9112,
188
+ -28.6578
189
+ ],
190
+ [
191
+ -36.7233, -30.0587, -24.8431, -18.4611, -16.8149, -23.9319, -32.8580,
192
+ -34.2264, -27.4332, -26.8027, -29.2721, -33.9033, -39.3403, -35.3232,
193
+ -26.8076, -28.6460, -35.2780, -36.0738, -35.4996, -37.7631, -39.5056,
194
+ -34.7112, -36.8741, -34.1066, -32.9474, -33.6604, -27.9937, -30.9594,
195
+ -26.2928, -32.0485, -29.2151, -29.2917, -32.7308, -29.6542, -31.1454,
196
+ -37.0088, -32.3388, -37.3086, -31.1024, -27.2889, -19.6788, -21.1488,
197
+ -19.5144, -14.8889, -21.2006, -24.7488, -27.7940, -31.1058, -27.5068,
198
+ -21.5737, -22.3780, -21.5151, -26.3086, -30.9223, -33.5043, -32.0307,
199
+ -37.3806, -41.6188, -45.6650, -40.5131, -32.5023, -26.7385, -26.3709,
200
+ -26.7761
201
+ ]
202
+ ],
203
+ [
204
+ # "repeatpad"
205
+ [
206
+ -25.7496, -24.9339, -24.1357, -23.1271, -23.7853, -26.1264, -29.1456,
207
+ -33.2060, -37.8179, -42.4833, -41.9386, -41.2164, -42.3566, -44.2575,
208
+ -40.0217, -36.6794, -36.6974, -38.7819, -42.0880, -45.5560, -39.9368,
209
+ -36.3219, -35.5981, -36.6434, -35.1851, -33.0684, -30.0437, -30.2010,
210
+ -34.3476, -42.1373, -38.8039, -37.3355, -40.4576, -41.0485, -40.6377,
211
+ -38.2275, -42.7481, -34.6084, -34.7048, -29.5149, -26.3935, -26.8952,
212
+ -34.1336, -26.2904, -28.2571, -32.5642, -36.7240, -35.5334, -38.2451,
213
+ -34.8177, -28.9754, -25.1096, -27.9768, -32.3184, -37.0269, -40.5136,
214
+ -40.8061, -36.4948, -40.3767, -38.9671, -38.3552, -34.1250, -30.9035,
215
+ -31.6112
216
+ ],
217
+ [
218
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
219
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
220
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
221
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
222
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
223
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
224
+ -100., -100., -100., -100.
225
+ ]
226
+ ],
227
+ [
228
+ # None, same as "repeatpad"
229
+ [
230
+ -25.7496, -24.9339, -24.1357, -23.1271, -23.7853, -26.1264, -29.1456,
231
+ -33.2060, -37.8179, -42.4833, -41.9386, -41.2164, -42.3566, -44.2575,
232
+ -40.0217, -36.6794, -36.6974, -38.7819, -42.0880, -45.5560, -39.9368,
233
+ -36.3219, -35.5981, -36.6434, -35.1851, -33.0684, -30.0437, -30.2010,
234
+ -34.3476, -42.1373, -38.8039, -37.3355, -40.4576, -41.0485, -40.6377,
235
+ -38.2275, -42.7481, -34.6084, -34.7048, -29.5149, -26.3935, -26.8952,
236
+ -34.1336, -26.2904, -28.2571, -32.5642, -36.7240, -35.5334, -38.2451,
237
+ -34.8177, -28.9754, -25.1096, -27.9768, -32.3184, -37.0269, -40.5136,
238
+ -40.8061, -36.4948, -40.3767, -38.9671, -38.3552, -34.1250, -30.9035,
239
+ -31.6112
240
+ ],
241
+ [
242
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
243
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
244
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
245
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
246
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
247
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
248
+ -100., -100., -100., -100.
249
+ ]
250
+ ],
251
+ [
252
+ # "pad"
253
+ [
254
+ -58.5260, -58.1155, -57.8623, -57.5059, -57.9178, -58.7171, -59.2343,
255
+ -59.9833, -60.9764, -62.0722, -63.5723, -65.7111, -67.5153, -68.7088,
256
+ -69.8325, -70.2987, -70.1548, -70.6233, -71.5702, -72.5159, -72.3821,
257
+ -70.1817, -67.0315, -64.1387, -62.2202, -61.0717, -60.4951, -61.6005,
258
+ -63.7358, -67.1400, -67.6185, -65.5635, -64.3593, -63.7138, -63.6209,
259
+ -66.4950, -72.6284, -63.3961, -56.8334, -52.7319, -50.6310, -51.3728,
260
+ -53.5619, -51.9190, -50.9708, -52.8684, -55.8073, -58.8227, -60.6991,
261
+ -57.0547, -52.7611, -51.4388, -54.4892, -60.8950, -66.1024, -72.4352,
262
+ -67.8538, -65.1463, -68.7588, -72.3080, -68.4864, -60.4688, -57.1516,
263
+ -60.9460
264
+ ],
265
+ [
266
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
267
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
268
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
269
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
270
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
271
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
272
+ -100., -100., -100., -100.
273
+ ]
274
+ ]
275
+ ]
276
+ )
277
+ # fmt: on
278
+ MEL_BIN = [[976, 977], [976, 977], [976, 977], [196, 197]]
279
+ input_speech = self._load_datasamples(1)
280
+ feature_extractor = ClapFeatureExtractor()
281
+ for padding, EXPECTED_VALUES, idx_in_mel in zip(
282
+ ["repeat", "repeatpad", None, "pad"], EXPECTED_INPUT_FEATURES, MEL_BIN
283
+ ):
284
+ input_features = feature_extractor(input_speech, return_tensors="pt", padding=padding).input_features
285
+ self.assertEqual(input_features.shape, (1, 4, 1001, 64))
286
+
287
+ torch.testing.assert_close(input_features[0, 0, idx_in_mel[0]], EXPECTED_VALUES[0], rtol=1e-4, atol=1e-4)
288
+ torch.testing.assert_close(input_features[0, 0, idx_in_mel[1]], EXPECTED_VALUES[1], rtol=1e-4, atol=1e-4)
289
+
290
+ self.assertTrue(torch.all(input_features[0, 0] == input_features[0, 1]))
291
+ self.assertTrue(torch.all(input_features[0, 0] == input_features[0, 2]))
292
+ self.assertTrue(torch.all(input_features[0, 0] == input_features[0, 3]))
293
+
294
+ def test_integration_rand_trunc_short_input(self):
295
+ # fmt: off
296
+ EXPECTED_INPUT_FEATURES = torch.tensor(
297
+ [
298
+ [
299
+ # "repeat"
300
+ [
301
+ -35.0483, -35.7865, -38.2884, -40.0220, -42.5349, -44.9489, -43.2228,
302
+ -44.6499, -47.6253, -49.6983, -50.2127, -52.5483, -52.2223, -51.9157,
303
+ -49.4082, -51.2024, -57.0476, -56.2803, -58.1618, -60.7474, -55.0389,
304
+ -60.9514, -59.3080, -50.4419, -47.8172, -48.7570, -55.2552, -44.5036,
305
+ -44.1148, -50.8218, -51.0968, -52.9408, -51.1037, -48.9789, -47.5897,
306
+ -52.0915, -55.4216, -54.1529, -58.0149, -58.0866, -52.7798, -52.6154,
307
+ -45.9144, -46.2008, -40.7603, -41.1703, -50.2250, -55.4112, -59.4818,
308
+ -54.5795, -53.5552, -51.3668, -49.8358, -50.3186, -54.0452, -57.6030,
309
+ -61.1589, -61.6415, -63.2756, -66.5890, -62.8543, -58.0665, -56.7203,
310
+ -56.7632
311
+ ],
312
+ [
313
+ -47.1320, -37.9961, -34.0076, -36.7109, -47.9057, -48.4924, -43.8371,
314
+ -44.9728, -48.1689, -52.9141, -57.6077, -52.8520, -44.8502, -45.6764,
315
+ -51.8389, -56.4284, -54.6972, -53.4889, -55.6077, -58.7149, -60.3760,
316
+ -54.0136, -56.0730, -55.9870, -54.4017, -53.1094, -53.5640, -50.3064,
317
+ -49.9520, -49.3239, -48.1668, -53.4852, -50.4561, -50.8688, -55.1970,
318
+ -51.5538, -53.0260, -59.6933, -54.8183, -59.5895, -55.9589, -50.3761,
319
+ -44.1282, -44.1463, -43.8540, -39.1168, -45.3893, -49.5542, -53.1505,
320
+ -55.2870, -50.3921, -46.8511, -47.4444, -49.5633, -56.0034, -59.0815,
321
+ -59.0018, -63.7589, -69.5745, -71.5789, -64.0498, -56.0558, -54.3475,
322
+ -54.7004
323
+ ]
324
+ ],
325
+ [
326
+ # "repeatpad"
327
+ [
328
+ -40.3184, -39.7186, -39.8807, -41.6508, -45.3613, -50.4785, -57.0297,
329
+ -60.4944, -59.1642, -58.9495, -60.4661, -62.5300, -58.4759, -55.2865,
330
+ -54.8973, -56.0780, -57.5482, -59.6557, -64.3309, -65.0330, -59.4941,
331
+ -56.8552, -55.0519, -55.9817, -56.9739, -55.2827, -54.5312, -51.4141,
332
+ -50.4289, -51.9131, -57.5821, -63.9979, -59.9180, -58.9489, -62.3247,
333
+ -62.6975, -63.7948, -60.5250, -64.6107, -58.7905, -57.0229, -54.3084,
334
+ -49.8445, -50.4459, -57.0172, -50.6425, -52.5992, -57.4207, -61.6358,
335
+ -60.6540, -63.1968, -57.4360, -52.3263, -51.7695, -57.1946, -62.9610,
336
+ -66.7359, -67.0335, -63.7440, -68.1775, -66.3798, -62.8650, -59.8972,
337
+ -59.3139
338
+ ],
339
+ [
340
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
341
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
342
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
343
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
344
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
345
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
346
+ -100., -100., -100., -100.
347
+ ]
348
+ ],
349
+ [
350
+ # None, same as "repeatpad"
351
+ [
352
+ -40.3184, -39.7186, -39.8807, -41.6508, -45.3613, -50.4785, -57.0297,
353
+ -60.4944, -59.1642, -58.9495, -60.4661, -62.5300, -58.4759, -55.2865,
354
+ -54.8973, -56.0780, -57.5482, -59.6557, -64.3309, -65.0330, -59.4941,
355
+ -56.8552, -55.0519, -55.9817, -56.9739, -55.2827, -54.5312, -51.4141,
356
+ -50.4289, -51.9131, -57.5821, -63.9979, -59.9180, -58.9489, -62.3247,
357
+ -62.6975, -63.7948, -60.5250, -64.6107, -58.7905, -57.0229, -54.3084,
358
+ -49.8445, -50.4459, -57.0172, -50.6425, -52.5992, -57.4207, -61.6358,
359
+ -60.6540, -63.1968, -57.4360, -52.3263, -51.7695, -57.1946, -62.9610,
360
+ -66.7359, -67.0335, -63.7440, -68.1775, -66.3798, -62.8650, -59.8972,
361
+ -59.3139
362
+ ],
363
+ [
364
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
365
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
366
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
367
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
368
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
369
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
370
+ -100., -100., -100., -100.
371
+ ]
372
+ ],
373
+ [
374
+ # "pad"
375
+ [
376
+ -73.3190, -73.6349, -74.1451, -74.8539, -75.7476, -76.5438, -78.5540,
377
+ -80.1339, -81.8911, -83.7560, -85.5387, -86.7466, -88.2072, -88.6090,
378
+ -88.8243, -89.0784, -89.4364, -89.8179, -91.3146, -92.2833, -91.7221,
379
+ -90.9440, -88.1315, -86.2425, -84.2281, -82.4893, -81.5993, -81.1328,
380
+ -81.5759, -83.1068, -85.6525, -88.9520, -88.9187, -87.2703, -86.3052,
381
+ -85.7188, -85.8802, -87.9996, -95.0464, -88.0133, -80.8561, -76.5597,
382
+ -74.2816, -74.8109, -77.3615, -76.0719, -75.3426, -77.6428, -80.9663,
383
+ -84.5275, -84.9907, -80.5205, -77.2851, -78.6259, -84.7740, -91.4535,
384
+ -98.1894, -94.3872, -92.3735, -97.6807, -98.1501, -91.4344, -85.2842,
385
+ -88.4338
386
+ ],
387
+ [
388
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
389
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
390
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
391
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
392
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
393
+ -100., -100., -100., -100., -100., -100., -100., -100., -100., -100.,
394
+ -100., -100., -100., -100.
395
+ ]
396
+ ]
397
+ ]
398
+ )
399
+ # fmt: on
400
+ MEL_BIN = [[976, 977], [976, 977], [976, 977], [196, 197]]
401
+ input_speech = self._load_datasamples(1)
402
+ feature_extractor = ClapFeatureExtractor()
403
+ for padding, EXPECTED_VALUES, idx_in_mel in zip(
404
+ ["repeat", "repeatpad", None, "pad"], EXPECTED_INPUT_FEATURES, MEL_BIN
405
+ ):
406
+ input_features = feature_extractor(
407
+ input_speech, return_tensors="pt", truncation="rand_trunc", padding=padding
408
+ ).input_features
409
+ self.assertEqual(input_features.shape, (1, 1, 1001, 64))
410
+ torch.testing.assert_close(input_features[0, 0, idx_in_mel[0]], EXPECTED_VALUES[0], rtol=1e-4, atol=1e-4)
411
+ torch.testing.assert_close(input_features[0, 0, idx_in_mel[1]], EXPECTED_VALUES[1], rtol=1e-4, atol=1e-4)
412
+
413
+ def test_integration_fusion_long_input(self):
414
+ # fmt: off
415
+ EXPECTED_INPUT_FEATURES = torch.tensor(
416
+ [
417
+ [
418
+ -11.1830, -10.1894, -8.6051, -4.8578, -1.3268, -8.4606, -14.5453,
419
+ -9.2017, 0.5781, 16.2129, 14.8289, 3.6326, -3.8794, -6.5544,
420
+ -2.4408, 1.9531, 6.0967, 1.7590, -7.6730, -6.1571, 2.0052,
421
+ 16.6694, 20.6447, 21.2145, 13.4972, 15.9043, 16.8987, 4.1766,
422
+ 11.9428, 21.2372, 12.3016, 4.8604, 6.7241, 1.8543, 4.9235,
423
+ 5.3188, -0.9897, -1.2416, -6.5864, 2.9529, 2.9274, 6.4753,
424
+ 10.2300, 11.2127, 3.4042, -1.0055, -6.0475, -6.7524, -3.9801,
425
+ -1.4434, 0.4740, -0.1584, -4.5457, -8.5746, -8.8428, -13.1475,
426
+ -9.6079, -8.5798, -4.1143, -3.7966, -7.1651, -6.1517, -8.0258,
427
+ -12.1486
428
+ ],
429
+ [
430
+ -10.2017, -7.9924, -5.9517, -3.9372, -1.9735, -4.3130, 16.1647,
431
+ 25.0592, 23.5532, 14.4974, -7.0778, -10.2262, 6.4782, 20.3454,
432
+ 19.4269, 1.7976, -16.5070, 4.9380, 12.3390, 6.9285, -13.6325,
433
+ -8.5298, 1.0839, -5.9629, -8.4812, 3.1331, -2.0963, -16.6046,
434
+ -14.0070, -17.5707, -13.2080, -17.2168, -17.7770, -12.1111, -18.6184,
435
+ -17.1897, -13.9801, -12.0426, -23.5400, -25.6823, -23.5813, -18.7847,
436
+ -20.5473, -25.6458, -19.7585, -27.6007, -28.9276, -24.8948, -25.4458,
437
+ -22.2807, -19.6613, -19.2669, -15.7813, -19.6821, -24.3439, -22.2598,
438
+ -28.2631, -30.1017, -32.7646, -33.6525, -27.5639, -22.0548, -27.8054,
439
+ -29.6947
440
+ ],
441
+ [
442
+ -9.2078, -7.2963, -6.2095, -7.9959, -2.9280, -11.1843, -6.1490,
443
+ 5.0733, 19.2957, 21.4578, 14.6803, -3.3153, -6.3334, -2.3542,
444
+ 6.9509, 15.2965, 14.6620, 5.2075, -0.0873, 1.1919, 18.1986,
445
+ 20.8470, 10.8035, 2.2516, 7.6905, 7.7427, -1.2543, -5.0018,
446
+ 0.9809, -2.1584, -5.4580, -5.4760, -11.8888, -9.0605, -8.4638,
447
+ -9.9897, -0.0540, -5.1629, 0.0483, -4.1504, -4.8140, -7.8236,
448
+ -9.0622, -10.1742, -8.9597, -11.5380, -16.5603, -17.1858, -17.5032,
449
+ -20.9326, -23.9543, -25.2602, -25.3429, -27.4536, -26.8859, -22.7852,
450
+ -25.8288, -24.8399, -23.8893, -24.2096, -26.5415, -23.7281, -25.6851,
451
+ -22.3629
452
+ ],
453
+ [
454
+ 1.3448, 2.9883, 4.0366, -0.8019, -10.4191, -10.0883, -4.3812,
455
+ 0.8136, 2.1579, 0.0832, 1.0949, -0.9759, -5.5319, -4.6009,
456
+ -6.5452, -14.9155, -20.1584, -9.3611, -2.4271, 1.4031, 4.9910,
457
+ 8.6916, 8.6785, 10.1973, 9.9029, 5.3840, 7.5336, 5.2803,
458
+ 2.8144, -0.3138, 2.2216, 5.7328, 7.5574, 7.7402, 1.0681,
459
+ 3.1049, 7.0742, 6.5588, 7.3712, 5.7881, 8.6874, 8.7725,
460
+ 2.8133, -4.5809, -6.1317, -5.1719, -5.0192, -9.0977, -10.9391,
461
+ -6.0769, 1.6016, -0.8965, -7.2252, -7.8632, -11.4468, -11.7446,
462
+ -10.7447, -7.0601, -2.7748, -4.1798, -2.8433, -3.1352, 0.8097,
463
+ 6.4212
464
+ ]
465
+ ]
466
+ )
467
+ # fmt: on
468
+ MEL_BIN = 963
469
+ input_speech = torch.cat([torch.tensor(x) for x in self._load_datasamples(5)])
470
+ feature_extractor = ClapFeatureExtractor()
471
+ for padding, EXPECTED_VALUES, block_idx in zip(
472
+ ["repeat", "repeatpad", None, "pad"], EXPECTED_INPUT_FEATURES, [1, 2, 0, 3]
473
+ ):
474
+ set_seed(987654321)
475
+ input_features = feature_extractor(input_speech, return_tensors="pt", padding=padding).input_features
476
+ self.assertEqual(input_features.shape, (1, 4, 1001, 64))
477
+ torch.testing.assert_close(input_features[0, block_idx, MEL_BIN], EXPECTED_VALUES, rtol=1e-3, atol=1e-3)
478
+
479
+ def test_integration_rand_trunc_long_input(self):
480
+ # fmt: off
481
+ EXPECTED_INPUT_FEATURES = torch.tensor(
482
+ [
483
+ [
484
+ -35.4022, -32.7555, -31.2004, -32.7764, -42.5770, -41.6339, -43.1630,
485
+ -44.5080, -44.3029, -48.9628, -39.5022, -39.2105, -43.1350, -43.2195,
486
+ -48.4894, -52.2344, -57.6891, -52.2228, -45.5155, -44.2893, -43.4697,
487
+ -46.6702, -43.7490, -40.4819, -42.7275, -46.3434, -46.8412, -41.2003,
488
+ -43.1681, -46.2948, -46.1925, -47.8333, -45.6812, -44.9182, -41.7786,
489
+ -43.3809, -44.3199, -42.8814, -45.4771, -46.7114, -46.9746, -42.7090,
490
+ -41.6057, -38.3965, -40.1980, -41.0263, -34.1256, -28.3289, -29.0201,
491
+ -30.4453, -29.5561, -30.1734, -25.9406, -19.0897, -15.8452, -20.1351,
492
+ -23.6515, -23.1194, -17.1845, -19.4399, -23.6527, -22.8768, -20.7279,
493
+ -22.7864
494
+ ],
495
+ [
496
+ -35.7719, -27.2566, -23.6964, -27.5521, 0.2510, 7.4391, 1.3917,
497
+ -13.3417, -28.1758, -17.0856, -5.7723, -0.8000, -7.8832, -15.5548,
498
+ -30.5935, -24.7571, -13.7009, -10.3432, -21.2464, -24.8118, -19.4080,
499
+ -14.9779, -11.7991, -18.4485, -20.1982, -17.3652, -20.6328, -28.2967,
500
+ -25.7819, -21.8962, -28.5083, -29.5719, -30.2120, -35.7033, -31.8218,
501
+ -34.0408, -37.7744, -33.9653, -31.3009, -30.9063, -28.6153, -32.2202,
502
+ -28.5456, -28.8579, -32.5170, -37.9152, -43.0052, -46.4849, -44.0786,
503
+ -39.1933, -33.2757, -31.6313, -42.6386, -52.3679, -53.5785, -55.6444,
504
+ -47.0050, -47.6459, -56.6361, -60.6781, -61.5244, -55.8272, -60.4832,
505
+ -58.1897
506
+ ],
507
+ [
508
+ -38.2686, -36.6285, -32.5835, -35.1693, -37.7938, -37.4035, -35.3132,
509
+ -35.6083, -36.3609, -40.9472, -36.7846, -36.1544, -38.9076, -39.3618,
510
+ -35.4953, -34.2809, -39.9466, -39.7433, -34.8347, -37.5674, -41.5689,
511
+ -38.9161, -34.3947, -30.2924, -30.4841, -34.5831, -28.9261, -24.8849,
512
+ -31.2324, -27.1622, -27.2107, -25.9385, -30.1691, -30.9223, -23.9495,
513
+ -25.6047, -26.7119, -28.5523, -27.7481, -32.8427, -35.4650, -31.0399,
514
+ -31.2073, -30.5163, -22.9819, -20.8892, -19.2510, -24.7905, -28.9426,
515
+ -28.1998, -26.7386, -25.0140, -27.9223, -32.9913, -33.1864, -34.9742,
516
+ -38.5995, -39.6990, -29.3203, -22.4697, -25.6415, -33.5608, -33.0945,
517
+ -27.1716
518
+ ],
519
+ [
520
+ -33.2015, -28.7741, -21.9457, -23.4888, -32.1072, -8.6307, 3.2724,
521
+ 5.9157, -0.9221, -30.1814, -31.0015, -27.4508, -27.0477, -9.5342,
522
+ 0.3221, 0.6511, -7.1596, -25.9707, -32.8924, -32.2300, -13.8974,
523
+ -0.4895, 0.9168, -10.7663, -27.1176, -35.0829, -11.6859, -4.8855,
524
+ -11.8898, -26.6167, -5.6192, -3.8443, -19.7947, -14.4101, -8.6236,
525
+ -21.2458, -21.0801, -17.9136, -24.4663, -18.6333, -24.8085, -15.5854,
526
+ -15.4344, -11.5046, -22.3625, -27.3387, -32.4353, -30.9670, -31.3789,
527
+ -35.4044, -34.4591, -25.2433, -28.0773, -33.8736, -33.0224, -33.3155,
528
+ -38.5302, -39.2741, -36.6395, -34.7729, -32.4483, -42.4001, -49.2857,
529
+ -39.1682
530
+ ]
531
+ ]
532
+ )
533
+ # fmt: on
534
+ MEL_BIN = 963
535
+ SEEDS = [987654321, 1234, 666, 5555]
536
+ input_speech = torch.cat([torch.tensor(x) for x in self._load_datasamples(5)])
537
+ feature_extractor = ClapFeatureExtractor()
538
+ for padding, EXPECTED_VALUES, seed in zip(
539
+ ["repeat", "repeatpad", None, "pad"], EXPECTED_INPUT_FEATURES, SEEDS
540
+ ):
541
+ set_seed(seed)
542
+ input_features = feature_extractor(
543
+ input_speech, return_tensors="pt", truncation="rand_trunc", padding=padding
544
+ ).input_features
545
+ self.assertEqual(input_features.shape, (1, 1, 1001, 64))
546
+ torch.testing.assert_close(input_features[0, 0, MEL_BIN], EXPECTED_VALUES, rtol=1e-4, atol=1e-4)
docs/transformers/tests/models/clap/test_modeling_clap.py ADDED
@@ -0,0 +1,755 @@
1
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Testing suite for the PyTorch CLAP model."""
15
+
16
+ import inspect
17
+ import os
18
+ import tempfile
19
+ import unittest
20
+
21
+ import numpy as np
22
+ from datasets import load_dataset
23
+
24
+ from transformers import ClapAudioConfig, ClapConfig, ClapProcessor, ClapTextConfig
25
+ from transformers.testing_utils import require_torch, slow, torch_device
26
+ from transformers.utils import is_torch_available
27
+
28
+ from ...test_configuration_common import ConfigTester
29
+ from ...test_modeling_common import (
30
+ ModelTesterMixin,
31
+ _config_zero_init,
32
+ floats_tensor,
33
+ ids_tensor,
34
+ random_attention_mask,
35
+ )
36
+ from ...test_pipeline_mixin import PipelineTesterMixin
37
+
38
+
39
+ if is_torch_available():
40
+ import torch
41
+ from torch import nn
42
+
43
+ from transformers import (
44
+ ClapAudioModel,
45
+ ClapAudioModelWithProjection,
46
+ ClapModel,
47
+ ClapTextModel,
48
+ ClapTextModelWithProjection,
49
+ )
50
+
51
+
52
+ class ClapAudioModelTester:
53
+ def __init__(
54
+ self,
55
+ parent,
56
+ batch_size=12,
57
+ image_size=60,
58
+ num_mel_bins=16,
59
+ window_size=4,
60
+ spec_size=64,
61
+ patch_size=2,
62
+ patch_stride=2,
63
+ seq_length=16,
64
+ freq_ratio=2,
65
+ num_channels=3,
66
+ is_training=True,
67
+ hidden_size=32,
68
+ patch_embeds_hidden_size=16,
69
+ projection_dim=32,
70
+ depths=[2, 2],
71
+ num_hidden_layers=2,
72
+ num_heads=[2, 2],
73
+ intermediate_size=37,
74
+ dropout=0.1,
75
+ attention_dropout=0.1,
76
+ initializer_range=0.02,
77
+ scope=None,
78
+ ):
79
+ self.parent = parent
80
+ self.batch_size = batch_size
81
+ self.image_size = image_size
82
+ self.num_mel_bins = num_mel_bins
83
+ self.window_size = window_size
84
+ self.patch_size = patch_size
85
+ self.num_channels = num_channels
86
+ self.is_training = is_training
87
+ self.hidden_size = hidden_size
88
+ self.projection_dim = projection_dim
89
+ self.num_hidden_layers = num_hidden_layers
90
+ self.depths = depths
91
+ self.num_heads = num_heads
92
+ self.num_attention_heads = num_heads[0]
93
+ self.seq_length = seq_length
94
+ self.spec_size = spec_size
95
+ self.freq_ratio = freq_ratio
96
+ self.patch_stride = patch_stride
97
+ self.patch_embeds_hidden_size = patch_embeds_hidden_size
98
+ self.intermediate_size = intermediate_size
99
+ self.dropout = dropout
100
+ self.attention_dropout = attention_dropout
101
+ self.initializer_range = initializer_range
102
+ self.scope = scope
103
+
104
+ def prepare_config_and_inputs(self):
105
+ input_features = floats_tensor([self.batch_size, 1, self.hidden_size, self.num_mel_bins])
106
+ config = self.get_config()
107
+
108
+ return config, input_features
109
+
110
+ def get_config(self):
111
+ return ClapAudioConfig(
112
+ image_size=self.image_size,
113
+ patch_size=self.patch_size,
114
+ num_mel_bins=self.num_mel_bins,
115
+ window_size=self.window_size,
116
+ num_channels=self.num_channels,
117
+ hidden_size=self.hidden_size,
118
+ patch_stride=self.patch_stride,
119
+ projection_dim=self.projection_dim,
120
+ depths=self.depths,
121
+ num_hidden_layers=self.num_hidden_layers,
122
+ num_attention_heads=self.num_heads,
123
+ intermediate_size=self.intermediate_size,
124
+ dropout=self.dropout,
125
+ attention_dropout=self.attention_dropout,
126
+ initializer_range=self.initializer_range,
127
+ spec_size=self.spec_size,
128
+ freq_ratio=self.freq_ratio,
129
+ patch_embeds_hidden_size=self.patch_embeds_hidden_size,
130
+ )
131
+
132
+ def create_and_check_model(self, config, input_features):
133
+ model = ClapAudioModel(config=config)
134
+ model.to(torch_device)
135
+ model.eval()
136
+ with torch.no_grad():
137
+ result = model(input_features)
138
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
139
+
140
+ def create_and_check_model_with_projection(self, config, input_features):
141
+ model = ClapAudioModelWithProjection(config=config)
142
+ model.to(torch_device)
143
+ model.eval()
144
+ with torch.no_grad():
145
+ result = model(input_features)
146
+ self.parent.assertEqual(result.audio_embeds.shape, (self.batch_size, self.projection_dim))
147
+
148
+ def prepare_config_and_inputs_for_common(self):
149
+ config_and_inputs = self.prepare_config_and_inputs()
150
+ config, input_features = config_and_inputs
151
+ inputs_dict = {"input_features": input_features}
152
+ return config, inputs_dict
153
+
154
+
155
+ @require_torch
156
+ class ClapAudioModelTest(ModelTesterMixin, unittest.TestCase):
157
+ """
158
+ Here we also overwrite some of the tests of test_modeling_common.py, as CLAP does not use input_ids, inputs_embeds,
159
+ attention_mask and seq_length.
160
+ """
161
+
162
+ all_model_classes = (ClapAudioModel, ClapAudioModelWithProjection) if is_torch_available() else ()
163
+ fx_compatible = False
164
+ test_pruning = False
165
+ test_resize_embeddings = False
166
+ test_head_masking = False
167
+
168
+ def setUp(self):
169
+ self.model_tester = ClapAudioModelTester(self)
170
+ self.config_tester = ConfigTester(self, config_class=ClapAudioConfig, has_text_modality=False, hidden_size=37)
171
+
172
+ def test_config(self):
173
+ self.config_tester.run_common_tests()
174
+
175
+ @unittest.skip(reason="ClapAudioModel does not use inputs_embeds")
176
+ def test_inputs_embeds(self):
177
+ pass
178
+
179
+ def test_model_get_set_embeddings(self):
180
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
181
+
182
+ for model_class in self.all_model_classes:
183
+ model = model_class(config)
184
+ self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
185
+ x = model.get_output_embeddings()
186
+ self.assertTrue(x is None or isinstance(x, nn.Linear))
187
+
188
+ def test_hidden_states_output(self):
189
+ def check_hidden_states_output(inputs_dict, config, model_class):
190
+ model = model_class(config)
191
+ model.to(torch_device)
192
+ model.eval()
193
+
194
+ with torch.no_grad():
195
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class))
196
+
197
+ hidden_states = outputs.hidden_states
198
+
199
+ expected_num_layers = getattr(
200
+ self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
201
+ )
202
+ self.assertEqual(len(hidden_states), expected_num_layers)
203
+
204
+ self.assertListEqual(
205
+ list(hidden_states[0].shape[-2:]),
206
+ [2 * self.model_tester.patch_embeds_hidden_size, 2 * self.model_tester.patch_embeds_hidden_size],
207
+ )
208
+
209
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
210
+
211
+ for model_class in self.all_model_classes:
212
+ inputs_dict["output_hidden_states"] = True
213
+ check_hidden_states_output(inputs_dict, config, model_class)
214
+
215
+ # check that output_hidden_states also work using config
216
+ del inputs_dict["output_hidden_states"]
217
+ config.output_hidden_states = True
218
+
219
+ check_hidden_states_output(inputs_dict, config, model_class)
220
+
221
+ @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass")
222
+ def test_retain_grad_hidden_states_attentions(self):
223
+ pass
224
+
225
+ def test_forward_signature(self):
226
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
227
+
228
+ for model_class in self.all_model_classes:
229
+ model = model_class(config)
230
+ signature = inspect.signature(model.forward)
231
+ # signature.parameters is an OrderedDict => so arg_names order is deterministic
232
+ arg_names = [*signature.parameters.keys()]
233
+
234
+ expected_arg_names = ["input_features"]
235
+ self.assertListEqual(arg_names[:1], expected_arg_names)
236
+
237
+ def test_model(self):
238
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
239
+ self.model_tester.create_and_check_model(*config_and_inputs)
240
+
241
+ def test_model_with_projection(self):
242
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
243
+ self.model_tester.create_and_check_model_with_projection(*config_and_inputs)
244
+
245
+ @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass")
246
+ def test_training(self):
247
+ pass
248
+
249
+ @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass")
250
+ def test_training_gradient_checkpointing(self):
251
+ pass
252
+
253
+ @unittest.skip(
254
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
255
+ )
256
+ def test_training_gradient_checkpointing_use_reentrant(self):
257
+ pass
258
+
259
+ @unittest.skip(
260
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
261
+ )
262
+ def test_training_gradient_checkpointing_use_reentrant_false(self):
263
+ pass
264
+
265
+ @slow
266
+ def test_model_from_pretrained(self):
267
+ model_name = "laion/clap-htsat-fused"
268
+ model = ClapAudioModel.from_pretrained(model_name)
269
+ self.assertIsNotNone(model)
270
+
271
+ @slow
272
+ def test_model_with_projection_from_pretrained(self):
273
+ model_name = "laion/clap-htsat-fused"
274
+ model = ClapAudioModelWithProjection.from_pretrained(model_name)
275
+ self.assertIsNotNone(model)
276
+ self.assertTrue(hasattr(model, "audio_projection"))
277
+
278
+
279
+ class ClapTextModelTester:
280
+ def __init__(
281
+ self,
282
+ parent,
283
+ batch_size=12,
284
+ seq_length=7,
285
+ is_training=True,
286
+ use_input_mask=True,
287
+ use_labels=True,
288
+ vocab_size=99,
289
+ hidden_size=32,
290
+ projection_dim=32,
291
+ num_hidden_layers=2,
292
+ num_attention_heads=4,
293
+ intermediate_size=37,
294
+ dropout=0.1,
295
+ attention_dropout=0.1,
296
+ max_position_embeddings=512,
297
+ initializer_range=0.02,
298
+ scope=None,
299
+ projection_hidden_act="relu",
300
+ ):
301
+ self.parent = parent
302
+ self.batch_size = batch_size
303
+ self.seq_length = seq_length
304
+ self.is_training = is_training
305
+ self.use_input_mask = use_input_mask
306
+ self.use_labels = use_labels
307
+ self.vocab_size = vocab_size
308
+ self.hidden_size = hidden_size
309
+ self.projection_dim = projection_dim
310
+ self.num_hidden_layers = num_hidden_layers
311
+ self.num_attention_heads = num_attention_heads
312
+ self.intermediate_size = intermediate_size
313
+ self.dropout = dropout
314
+ self.attention_dropout = attention_dropout
315
+ self.max_position_embeddings = max_position_embeddings
316
+ self.initializer_range = initializer_range
317
+ self.scope = scope
318
+ self.projection_hidden_act = projection_hidden_act
319
+
320
+ def prepare_config_and_inputs(self):
321
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
322
+
323
+ input_mask = None
324
+ if self.use_input_mask:
325
+ input_mask = random_attention_mask([self.batch_size, self.seq_length])
326
+
327
+ if input_mask is not None:
328
+ batch_size, seq_length = input_mask.shape
329
+ rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
330
+ for batch_idx, start_index in enumerate(rnd_start_indices):
331
+ input_mask[batch_idx, :start_index] = 1
332
+ input_mask[batch_idx, start_index:] = 0
333
+
334
+ config = self.get_config()
335
+
336
+ return config, input_ids, input_mask
337
+
338
+ def get_config(self):
339
+ return ClapTextConfig(
340
+ vocab_size=self.vocab_size,
341
+ hidden_size=self.hidden_size,
342
+ projection_dim=self.projection_dim,
343
+ num_hidden_layers=self.num_hidden_layers,
344
+ num_attention_heads=self.num_attention_heads,
345
+ intermediate_size=self.intermediate_size,
346
+ dropout=self.dropout,
347
+ attention_dropout=self.attention_dropout,
348
+ max_position_embeddings=self.max_position_embeddings,
349
+ initializer_range=self.initializer_range,
350
+ projection_hidden_act=self.projection_hidden_act,
351
+ )
352
+
353
+ def create_and_check_model(self, config, input_ids, input_mask):
354
+ model = ClapTextModel(config=config)
355
+ model.to(torch_device)
356
+ model.eval()
357
+ with torch.no_grad():
358
+ result = model(input_ids, attention_mask=input_mask)
359
+ result = model(input_ids)
360
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
361
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
362
+
363
+ def create_and_check_model_with_projection(self, config, input_ids, input_mask):
364
+ model = ClapTextModelWithProjection(config=config)
365
+ model.to(torch_device)
366
+ model.eval()
367
+ with torch.no_grad():
368
+ result = model(input_ids, attention_mask=input_mask)
369
+ result = model(input_ids)
370
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
371
+ self.parent.assertEqual(result.text_embeds.shape, (self.batch_size, self.projection_dim))
372
+
373
+ def prepare_config_and_inputs_for_common(self):
374
+ config_and_inputs = self.prepare_config_and_inputs()
375
+ config, input_ids, input_mask = config_and_inputs
376
+ inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
377
+ return config, inputs_dict
378
+
379
+
380
+ @require_torch
381
+ class ClapTextModelTest(ModelTesterMixin, unittest.TestCase):
382
+ all_model_classes = (ClapTextModel, ClapTextModelWithProjection) if is_torch_available() else ()
383
+ fx_compatible = False
384
+ test_pruning = False
385
+ test_head_masking = False
386
+
387
+ def setUp(self):
388
+ self.model_tester = ClapTextModelTester(self)
389
+ self.config_tester = ConfigTester(self, config_class=ClapTextConfig, hidden_size=37)
390
+
391
+ def test_config(self):
392
+ self.config_tester.run_common_tests()
393
+
394
+ def test_model(self):
395
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
396
+ self.model_tester.create_and_check_model(*config_and_inputs)
397
+
398
+ def test_model_with_projection(self):
399
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
400
+ self.model_tester.create_and_check_model_with_projection(*config_and_inputs)
401
+
402
+ @unittest.skip(reason="ClapTextModel does not output any loss term in the forward pass")
403
+ def test_training(self):
404
+ pass
405
+
406
+ @unittest.skip(reason="ClapTextModel does not output any loss term in the forward pass")
407
+ def test_training_gradient_checkpointing(self):
408
+ pass
409
+
410
+ @unittest.skip(
411
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
412
+ )
413
+ def test_training_gradient_checkpointing_use_reentrant(self):
414
+ pass
415
+
416
+ @unittest.skip(
417
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
418
+ )
419
+ def test_training_gradient_checkpointing_use_reentrant_false(self):
420
+ pass
421
+
422
+ @unittest.skip(reason="ClapTextModel does not use inputs_embeds")
423
+ def test_inputs_embeds(self):
424
+ pass
425
+
426
+ @slow
427
+ def test_model_from_pretrained(self):
428
+ model_name = "laion/clap-htsat-fused"
429
+ model = ClapTextModel.from_pretrained(model_name)
430
+ self.assertIsNotNone(model)
431
+
432
+ @slow
433
+ def test_model_with_projection_from_pretrained(self):
434
+ model_name = "laion/clap-htsat-fused"
435
+ model = ClapTextModelWithProjection.from_pretrained(model_name)
436
+ self.assertIsNotNone(model)
437
+ self.assertTrue(hasattr(model, "text_projection"))
438
+
439
+
440
+ class ClapModelTester:
441
+ def __init__(self, parent, text_kwargs=None, audio_kwargs=None, is_training=True):
442
+ if text_kwargs is None:
443
+ text_kwargs = {}
444
+ if audio_kwargs is None:
445
+ audio_kwargs = {}
446
+
447
+ self.parent = parent
448
+ self.text_model_tester = ClapTextModelTester(parent, **text_kwargs)
449
+ self.audio_model_tester = ClapAudioModelTester(parent, **audio_kwargs)
450
+ self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
451
+ self.is_training = is_training
452
+
453
+ def prepare_config_and_inputs(self):
454
+ _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
455
+ _, input_features = self.audio_model_tester.prepare_config_and_inputs()
456
+
457
+ config = self.get_config()
458
+
459
+ return config, input_ids, attention_mask, input_features
460
+
461
+ def get_config(self):
462
+ return ClapConfig.from_text_audio_configs(
463
+ self.text_model_tester.get_config(), self.audio_model_tester.get_config(), projection_dim=64
464
+ )
465
+
466
+ def create_and_check_model(self, config, input_ids, attention_mask, input_features):
467
+ model = ClapModel(config).to(torch_device).eval()
468
+ with torch.no_grad():
469
+ result = model(input_ids, input_features, attention_mask)
470
+ self.parent.assertEqual(
471
+ result.logits_per_audio.shape, (self.audio_model_tester.batch_size, self.text_model_tester.batch_size)
472
+ )
473
+ self.parent.assertEqual(
474
+ result.logits_per_text.shape, (self.text_model_tester.batch_size, self.audio_model_tester.batch_size)
475
+ )
476
+
477
+ def prepare_config_and_inputs_for_common(self):
478
+ config_and_inputs = self.prepare_config_and_inputs()
479
+ config, input_ids, attention_mask, input_features = config_and_inputs
480
+ inputs_dict = {
481
+ "input_ids": input_ids,
482
+ "attention_mask": attention_mask,
483
+ "input_features": input_features,
484
+ "return_loss": True,
485
+ }
486
+ return config, inputs_dict
487
+
488
+
489
+ @require_torch
490
+ class ClapModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
491
+ all_model_classes = (ClapModel,) if is_torch_available() else ()
492
+ pipeline_model_mapping = {"feature-extraction": ClapModel} if is_torch_available() else {}
493
+ fx_compatible = False
494
+ test_head_masking = False
495
+ test_pruning = False
496
+ test_resize_embeddings = False
497
+ test_attention_outputs = False
498
+
499
+ def setUp(self):
500
+ self.model_tester = ClapModelTester(self)
501
+ common_properties = ["logit_scale_init_value", "projection_hidden_act", "projection_dim"]
502
+ self.config_tester = ConfigTester(
503
+ self, config_class=ClapConfig, has_text_modality=False, common_properties=common_properties
504
+ )
505
+
506
+ def test_model(self):
507
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
508
+ self.model_tester.create_and_check_model(*config_and_inputs)
509
+
510
+ def test_config(self):
511
+ self.config_tester.run_common_tests()
512
+
513
+ @unittest.skip(reason="Hidden_states is tested in individual model tests")
514
+ def test_hidden_states_output(self):
515
+ pass
516
+
517
+ @unittest.skip(reason="Inputs_embeds is tested in individual model tests")
518
+ def test_inputs_embeds(self):
519
+ pass
520
+
521
+ @unittest.skip(reason="Retain_grad is tested in individual model tests")
522
+ def test_retain_grad_hidden_states_attentions(self):
523
+ pass
524
+
525
+ @unittest.skip(reason="ClapModel does not have input/output embeddings")
526
+ def test_model_get_set_embeddings(self):
527
+ pass
528
+
529
+ # override as the `logit_scale` parameter initialization is different for CLAP
530
+ def test_initialization(self):
531
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
532
+
533
+ configs_no_init = _config_zero_init(config)
534
+ for model_class in self.all_model_classes:
535
+ model = model_class(config=configs_no_init)
536
+ for name, param in model.named_parameters():
537
+ if param.requires_grad:
538
+ # check if `logit_scale` is initialized as per the original implementation
539
+ if name == "logit_scale":
540
+ self.assertAlmostEqual(
541
+ param.data.item(),
542
+ np.log(1 / 0.07),
543
+ delta=1e-3,
544
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
545
+ )
546
+ else:
547
+ self.assertIn(
548
+ ((param.data.mean() * 1e9).round() / 1e9).item(),
549
+ [0.0, 1.0],
550
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
551
+ )
552
+
553
+ def _create_and_check_torchscript(self, config, inputs_dict):
554
+ if not self.test_torchscript:
555
+ self.skipTest(reason="test_torchscript is set to False")
556
+
557
+ configs_no_init = _config_zero_init(config) # To be sure we have no Nan
558
+ configs_no_init.torchscript = True
559
+ configs_no_init.return_dict = False
560
+ for model_class in self.all_model_classes:
561
+ model = model_class(config=configs_no_init)
562
+ model.to(torch_device)
563
+ model.eval()
564
+
565
+ try:
566
+ input_ids = inputs_dict["input_ids"]
567
+ input_features = inputs_dict["input_features"] # CLAP needs input_features
568
+ traced_model = torch.jit.trace(model, (input_ids, input_features))
569
+ except RuntimeError:
570
+ self.fail("Couldn't trace module.")
571
+
572
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
573
+ pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
574
+
575
+ try:
576
+ torch.jit.save(traced_model, pt_file_name)
577
+ except Exception:
578
+ self.fail("Couldn't save module.")
579
+
580
+ try:
581
+ loaded_model = torch.jit.load(pt_file_name)
582
+ except Exception:
583
+ self.fail("Couldn't load module.")
584
+
585
+ model.to(torch_device)
586
+ model.eval()
587
+
588
+ loaded_model.to(torch_device)
589
+ loaded_model.eval()
590
+
591
+ model_state_dict = model.state_dict()
592
+ loaded_model_state_dict = loaded_model.state_dict()
593
+
594
+ non_persistent_buffers = {}
595
+ for key in loaded_model_state_dict.keys():
596
+ if key not in model_state_dict.keys():
597
+ non_persistent_buffers[key] = loaded_model_state_dict[key]
598
+
599
+ loaded_model_state_dict = {
600
+ key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
601
+ }
602
+
603
+ self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
604
+
605
+ model_buffers = list(model.buffers())
606
+ for non_persistent_buffer in non_persistent_buffers.values():
607
+ found_buffer = False
608
+ for i, model_buffer in enumerate(model_buffers):
609
+ if torch.equal(non_persistent_buffer, model_buffer):
610
+ found_buffer = True
611
+ break
612
+
613
+ self.assertTrue(found_buffer)
614
+ model_buffers.pop(i)
615
+
616
+ models_equal = True
617
+ for layer_name, p1 in model_state_dict.items():
618
+ p2 = loaded_model_state_dict[layer_name]
619
+ if p1.data.ne(p2.data).sum() > 0:
620
+ models_equal = False
621
+
622
+ self.assertTrue(models_equal)
623
+
624
+ def test_load_audio_text_config(self):
625
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
626
+
627
+ # Save ClapConfig and check if we can load ClapAudioConfig from it
628
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
629
+ config.save_pretrained(tmp_dir_name)
630
+ audio_config = ClapAudioConfig.from_pretrained(tmp_dir_name)
631
+ self.assertDictEqual(config.audio_config.to_dict(), audio_config.to_dict())
632
+
633
+ # Save ClapConfig and check if we can load ClapTextConfig from it
634
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
635
+ config.save_pretrained(tmp_dir_name)
636
+ text_config = ClapTextConfig.from_pretrained(tmp_dir_name)
637
+ self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict())
638
+
639
+ @slow
640
+ def test_model_from_pretrained(self):
641
+ model_name = "laion/clap-htsat-fused"
642
+ model = ClapModel.from_pretrained(model_name)
643
+ self.assertIsNotNone(model)
644
+
645
+
646
+ @slow
647
+ @require_torch
648
+ class ClapModelIntegrationTest(unittest.TestCase):
649
+ paddings = ["repeatpad", "repeat", "pad"]
650
+
651
+ def test_integration_unfused(self):
652
+ EXPECTED_MEANS_UNFUSED = {
653
+ "repeatpad": 0.0024,
654
+ "pad": 0.0020,
655
+ "repeat": 0.0023,
656
+ }
657
+
658
+ librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
659
+ audio_sample = librispeech_dummy[-1]
660
+
661
+ model_id = "laion/clap-htsat-unfused"
662
+
663
+ model = ClapModel.from_pretrained(model_id).to(torch_device)
664
+ processor = ClapProcessor.from_pretrained(model_id)
665
+
666
+ for padding in self.paddings:
667
+ inputs = processor(audios=audio_sample["audio"]["array"], return_tensors="pt", padding=padding).to(
668
+ torch_device
669
+ )
670
+
671
+ audio_embed = model.get_audio_features(**inputs)
672
+ expected_mean = EXPECTED_MEANS_UNFUSED[padding]
673
+
674
+ self.assertTrue(
675
+ torch.allclose(audio_embed.cpu().mean(), torch.tensor([expected_mean]), atol=1e-3, rtol=1e-3)
676
+ )
677
+
678
+ def test_integration_fused(self):
679
+ EXPECTED_MEANS_FUSED = {
680
+ "repeatpad": 0.00069,
681
+ "repeat": 0.00196,
682
+ "pad": -0.000379,
683
+ }
684
+
685
+ librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
686
+ audio_sample = librispeech_dummy[-1]
687
+
688
+ model_id = "laion/clap-htsat-fused"
689
+
690
+ model = ClapModel.from_pretrained(model_id).to(torch_device)
691
+ processor = ClapProcessor.from_pretrained(model_id)
692
+
693
+ for padding in self.paddings:
694
+ inputs = processor(
695
+ audios=audio_sample["audio"]["array"], return_tensors="pt", padding=padding, truncation="fusion"
696
+ ).to(torch_device)
697
+
698
+ audio_embed = model.get_audio_features(**inputs)
699
+ expected_mean = EXPECTED_MEANS_FUSED[padding]
700
+
701
+ self.assertTrue(
702
+ torch.allclose(audio_embed.cpu().mean(), torch.tensor([expected_mean]), atol=1e-3, rtol=1e-3)
703
+ )
704
+
705
+ def test_batched_fused(self):
706
+ EXPECTED_MEANS_FUSED = {
707
+ "repeatpad": 0.0010,
708
+ "repeat": 0.0020,
709
+ "pad": 0.0006,
710
+ }
711
+
712
+ librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
713
+ audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]]
714
+
715
+ model_id = "laion/clap-htsat-fused"
716
+
717
+ model = ClapModel.from_pretrained(model_id).to(torch_device)
718
+ processor = ClapProcessor.from_pretrained(model_id)
719
+
720
+ for padding in self.paddings:
721
+ inputs = processor(audios=audio_samples, return_tensors="pt", padding=padding, truncation="fusion").to(
722
+ torch_device
723
+ )
724
+
725
+ audio_embed = model.get_audio_features(**inputs)
726
+ expected_mean = EXPECTED_MEANS_FUSED[padding]
727
+
728
+ self.assertTrue(
729
+ torch.allclose(audio_embed.cpu().mean(), torch.tensor([expected_mean]), atol=1e-3, rtol=1e-3)
730
+ )
731
+
732
+ def test_batched_unfused(self):
733
+ EXPECTED_MEANS_FUSED = {
734
+ "repeatpad": 0.0016,
735
+ "repeat": 0.0019,
736
+ "pad": 0.0019,
737
+ }
738
+
739
+ librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
740
+ audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]]
741
+
742
+ model_id = "laion/clap-htsat-unfused"
743
+
744
+ model = ClapModel.from_pretrained(model_id).to(torch_device)
745
+ processor = ClapProcessor.from_pretrained(model_id)
746
+
747
+ for padding in self.paddings:
748
+ inputs = processor(audios=audio_samples, return_tensors="pt", padding=padding).to(torch_device)
749
+
750
+ audio_embed = model.get_audio_features(**inputs)
751
+ expected_mean = EXPECTED_MEANS_FUSED[padding]
752
+
753
+ self.assertTrue(
754
+ torch.allclose(audio_embed.cpu().mean(), torch.tensor([expected_mean]), atol=1e-3, rtol=1e-3)
755
+ )
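For reference, the flow these CLAP integration tests exercise can be sketched outside the test suite. The snippet below is illustrative only: it reuses the `laion/clap-htsat-unfused` checkpoint and the `padding="repeatpad"` option from the tests above, but substitutes a random array for a real 48 kHz waveform, and the text prompts are made up for the example.

# Minimal sketch of the audio/text embedding flow exercised by the integration tests above.
# The random waveform is only a stand-in for real mono audio at the model's sampling rate.
import numpy as np
import torch
from transformers import ClapModel, ClapProcessor

model_id = "laion/clap-htsat-unfused"
model = ClapModel.from_pretrained(model_id)
processor = ClapProcessor.from_pretrained(model_id)

waveform = np.random.randn(48_000).astype(np.float32)  # stand-in audio sample

audio_inputs = processor(audios=waveform, return_tensors="pt", padding="repeatpad")
text_inputs = processor(text=["a dog barking", "a person speaking"], return_tensors="pt", padding=True)

with torch.no_grad():
    audio_embed = model.get_audio_features(**audio_inputs)   # same call as in the tests above
    text_embed = model.get_text_features(**text_inputs)

# Cosine similarity between the single audio clip and each text prompt
similarity = torch.nn.functional.cosine_similarity(audio_embed, text_embed)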
docs/transformers/tests/models/clap/test_processor_clap.py ADDED
@@ -0,0 +1,125 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import shutil
16
+ import tempfile
17
+ import unittest
18
+
19
+ from transformers import ClapFeatureExtractor, ClapProcessor, RobertaTokenizer, RobertaTokenizerFast
20
+ from transformers.testing_utils import require_sentencepiece, require_torchaudio
21
+
22
+ from .test_feature_extraction_clap import floats_list
23
+
24
+
25
+ @require_torchaudio
26
+ @require_sentencepiece
27
+ class ClapProcessorTest(unittest.TestCase):
28
+ def setUp(self):
29
+ self.checkpoint = "laion/clap-htsat-unfused"
30
+ self.tmpdirname = tempfile.mkdtemp()
31
+
32
+ def get_tokenizer(self, **kwargs):
33
+ return RobertaTokenizer.from_pretrained(self.checkpoint, **kwargs)
34
+
35
+ def get_feature_extractor(self, **kwargs):
36
+ return ClapFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)
37
+
38
+ def tearDown(self):
39
+ shutil.rmtree(self.tmpdirname)
40
+
41
+ def test_save_load_pretrained_default(self):
42
+ tokenizer = self.get_tokenizer()
43
+ feature_extractor = self.get_feature_extractor()
44
+
45
+ processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
46
+
47
+ processor.save_pretrained(self.tmpdirname)
48
+ processor = ClapProcessor.from_pretrained(self.tmpdirname)
49
+
50
+ self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
51
+ self.assertIsInstance(processor.tokenizer, RobertaTokenizerFast)
52
+
53
+ self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
54
+ self.assertIsInstance(processor.feature_extractor, ClapFeatureExtractor)
55
+
56
+ def test_save_load_pretrained_additional_features(self):
57
+ processor = ClapProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
58
+ processor.save_pretrained(self.tmpdirname)
59
+
60
+ tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
61
+ feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
62
+
63
+ processor = ClapProcessor.from_pretrained(
64
+ self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
65
+ )
66
+
67
+ self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
68
+ self.assertIsInstance(processor.tokenizer, RobertaTokenizerFast)
69
+
70
+ self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
71
+ self.assertIsInstance(processor.feature_extractor, ClapFeatureExtractor)
72
+
73
+ def test_feature_extractor(self):
74
+ feature_extractor = self.get_feature_extractor()
75
+ tokenizer = self.get_tokenizer()
76
+
77
+ processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
78
+
79
+ raw_speech = floats_list((3, 1000))
80
+
81
+ input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
82
+ input_processor = processor(audios=raw_speech, return_tensors="np")
83
+
84
+ for key in input_feat_extract.keys():
85
+ self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
86
+
87
+ def test_tokenizer(self):
88
+ feature_extractor = self.get_feature_extractor()
89
+ tokenizer = self.get_tokenizer()
90
+
91
+ processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
92
+
93
+ input_str = "This is a test string"
94
+
95
+ encoded_processor = processor(text=input_str)
96
+
97
+ encoded_tok = tokenizer(input_str)
98
+
99
+ for key in encoded_tok.keys():
100
+ self.assertListEqual(encoded_tok[key], encoded_processor[key])
101
+
102
+ def test_tokenizer_decode(self):
103
+ feature_extractor = self.get_feature_extractor()
104
+ tokenizer = self.get_tokenizer()
105
+
106
+ processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
107
+
108
+ predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
109
+
110
+ decoded_processor = processor.batch_decode(predicted_ids)
111
+ decoded_tok = tokenizer.batch_decode(predicted_ids)
112
+
113
+ self.assertListEqual(decoded_tok, decoded_processor)
114
+
115
+ def test_model_input_names(self):
116
+ feature_extractor = self.get_feature_extractor()
117
+ tokenizer = self.get_tokenizer()
118
+
119
+ processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
120
+
121
+ self.assertListEqual(
122
+ processor.model_input_names[2:],
123
+ feature_extractor.model_input_names,
124
+ msg="`processor` and `feature_extractor` model input names do not match",
125
+ )
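The save/reload-with-overrides behaviour checked in `test_save_load_pretrained_additional_features` can be sketched as follows. This is a minimal illustration built from the components the test itself loads; the temporary directory and the override values mirror the test and are not meaningful on their own.

# Minimal sketch of the processor round trip with extra kwargs, as in the test above.
import tempfile
from transformers import ClapFeatureExtractor, ClapProcessor, RobertaTokenizer

checkpoint = "laion/clap-htsat-unfused"
processor = ClapProcessor(
    tokenizer=RobertaTokenizer.from_pretrained(checkpoint),
    feature_extractor=ClapFeatureExtractor.from_pretrained(checkpoint),
)

with tempfile.TemporaryDirectory() as tmpdir:
    processor.save_pretrained(tmpdir)
    # Extra kwargs are forwarded to the tokenizer and feature extractor on reload
    reloaded = ClapProcessor.from_pretrained(tmpdir, bos_token="(BOS)", eos_token="(EOS)", padding_value=1.0)

print(reloaded.tokenizer.bos_token)              # "(BOS)"
print(reloaded.feature_extractor.padding_value)  # 1.0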
docs/transformers/tests/models/clip/__init__.py ADDED
File without changes
docs/transformers/tests/models/clip/test_image_processing_clip.py ADDED
@@ -0,0 +1,128 @@
1
+ # Copyright 2021 HuggingFace Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import unittest
17
+
18
+ from transformers.testing_utils import require_torch, require_vision
19
+ from transformers.utils import is_torchvision_available, is_vision_available
20
+
21
+ from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
22
+
23
+
24
+ if is_vision_available():
25
+ from transformers import CLIPImageProcessor
26
+
27
+ if is_torchvision_available():
28
+ from transformers import CLIPImageProcessorFast
29
+
30
+
31
+ class CLIPImageProcessingTester:
32
+ def __init__(
33
+ self,
34
+ parent,
35
+ batch_size=7,
36
+ num_channels=3,
37
+ image_size=18,
38
+ min_resolution=30,
39
+ max_resolution=400,
40
+ do_resize=True,
41
+ size=None,
42
+ do_center_crop=True,
43
+ crop_size=None,
44
+ do_normalize=True,
45
+ image_mean=[0.48145466, 0.4578275, 0.40821073],
46
+ image_std=[0.26862954, 0.26130258, 0.27577711],
47
+ do_convert_rgb=True,
48
+ ):
49
+ super().__init__()
50
+ size = size if size is not None else {"shortest_edge": 20}
51
+ crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
52
+ self.parent = parent
53
+ self.batch_size = batch_size
54
+ self.num_channels = num_channels
55
+ self.image_size = image_size
56
+ self.min_resolution = min_resolution
57
+ self.max_resolution = max_resolution
58
+ self.do_resize = do_resize
59
+ self.size = size
60
+ self.do_center_crop = do_center_crop
61
+ self.crop_size = crop_size
62
+ self.do_normalize = do_normalize
63
+ self.image_mean = image_mean
64
+ self.image_std = image_std
65
+ self.do_convert_rgb = do_convert_rgb
66
+
67
+ def prepare_image_processor_dict(self):
68
+ return {
69
+ "do_resize": self.do_resize,
70
+ "size": self.size,
71
+ "do_center_crop": self.do_center_crop,
72
+ "crop_size": self.crop_size,
73
+ "do_normalize": self.do_normalize,
74
+ "image_mean": self.image_mean,
75
+ "image_std": self.image_std,
76
+ "do_convert_rgb": self.do_convert_rgb,
77
+ }
78
+
79
+ def expected_output_image_shape(self, images):
80
+ return self.num_channels, self.crop_size["height"], self.crop_size["width"]
81
+
82
+ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
83
+ return prepare_image_inputs(
84
+ batch_size=self.batch_size,
85
+ num_channels=self.num_channels,
86
+ min_resolution=self.min_resolution,
87
+ max_resolution=self.max_resolution,
88
+ equal_resolution=equal_resolution,
89
+ numpify=numpify,
90
+ torchify=torchify,
91
+ )
92
+
93
+
94
+ @require_torch
95
+ @require_vision
96
+ class CLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
97
+ image_processing_class = CLIPImageProcessor if is_vision_available() else None
98
+ fast_image_processing_class = CLIPImageProcessorFast if is_torchvision_available() else None
99
+
100
+ def setUp(self):
101
+ super().setUp()
102
+ self.image_processor_tester = CLIPImageProcessingTester(self)
103
+
104
+ @property
105
+ def image_processor_dict(self):
106
+ return self.image_processor_tester.prepare_image_processor_dict()
107
+
108
+ def test_image_processor_properties(self):
109
+ for image_processing_class in self.image_processor_list:
110
+ image_processing = image_processing_class(**self.image_processor_dict)
111
+ self.assertTrue(hasattr(image_processing, "do_resize"))
112
+ self.assertTrue(hasattr(image_processing, "size"))
113
+ self.assertTrue(hasattr(image_processing, "do_center_crop"))
114
+ self.assertTrue(hasattr(image_processing, "center_crop"))
115
+ self.assertTrue(hasattr(image_processing, "do_normalize"))
116
+ self.assertTrue(hasattr(image_processing, "image_mean"))
117
+ self.assertTrue(hasattr(image_processing, "image_std"))
118
+ self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
119
+
120
+ def test_image_processor_from_dict_with_kwargs(self):
121
+ for image_processing_class in self.image_processor_list:
122
+ image_processor = image_processing_class.from_dict(self.image_processor_dict)
123
+ self.assertEqual(image_processor.size, {"shortest_edge": 20})
124
+ self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
125
+
126
+ image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84)
127
+ self.assertEqual(image_processor.size, {"shortest_edge": 42})
128
+ self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
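The resize/crop configuration these tests assert can be seen end to end with a small standalone sketch. It assumes Pillow and NumPy are available and uses the same `size`/`crop_size` values as the tester above; the random image is only a placeholder input.

# Minimal sketch of the resize + center-crop behaviour checked by CLIPImageProcessingTest.
import numpy as np
from PIL import Image
from transformers import CLIPImageProcessor

image_processor = CLIPImageProcessor(
    do_resize=True,
    size={"shortest_edge": 20},
    do_center_crop=True,
    crop_size={"height": 18, "width": 18},
)

# Placeholder 40x60 RGB image standing in for a real input
image = Image.fromarray(np.random.randint(0, 256, (40, 60, 3), dtype=np.uint8))
pixel_values = image_processor(image, return_tensors="np").pixel_values

# One image, three channels, cropped to 18x18, matching expected_output_image_shape
assert pixel_values.shape == (1, 3, 18, 18)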
docs/transformers/tests/models/clip/test_modeling_clip.py ADDED
@@ -0,0 +1,948 @@
1
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Testing suite for the PyTorch CLIP model."""
15
+
16
+ import inspect
17
+ import os
18
+ import tempfile
19
+ import unittest
20
+
21
+ import numpy as np
22
+ import requests
23
+ from parameterized import parameterized
24
+ from pytest import mark
25
+
26
+ from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
27
+ from transformers.testing_utils import (
28
+ require_flash_attn,
29
+ require_torch,
30
+ require_torch_gpu,
31
+ require_torch_sdpa,
32
+ require_vision,
33
+ slow,
34
+ torch_device,
35
+ )
36
+ from transformers.utils import (
37
+ is_torch_available,
38
+ is_vision_available,
39
+ )
40
+
41
+ from ...test_configuration_common import ConfigTester
42
+ from ...test_modeling_common import (
43
+ TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
44
+ ModelTesterMixin,
45
+ _config_zero_init,
46
+ floats_tensor,
47
+ ids_tensor,
48
+ is_flaky,
49
+ random_attention_mask,
50
+ )
51
+ from ...test_pipeline_mixin import PipelineTesterMixin
52
+
53
+
54
+ if is_torch_available():
55
+ import torch
56
+ from torch import nn
57
+
58
+ from transformers import (
59
+ CLIPForImageClassification,
60
+ CLIPModel,
61
+ CLIPTextModel,
62
+ CLIPTextModelWithProjection,
63
+ CLIPVisionModel,
64
+ CLIPVisionModelWithProjection,
65
+ )
66
+
67
+ if is_vision_available():
68
+ from PIL import Image
69
+
70
+ from transformers import CLIPProcessor
71
+
72
+
73
+ class CLIPVisionModelTester:
74
+ def __init__(
75
+ self,
76
+ parent,
77
+ batch_size=12,
78
+ image_size=30,
79
+ patch_size=2,
80
+ num_channels=3,
81
+ is_training=True,
82
+ hidden_size=32,
83
+ projection_dim=32,
84
+ num_hidden_layers=2,
85
+ num_attention_heads=4,
86
+ intermediate_size=37,
87
+ dropout=0.1,
88
+ attention_dropout=0.1,
89
+ initializer_range=0.02,
90
+ scope=None,
91
+ ):
92
+ self.parent = parent
93
+ self.batch_size = batch_size
94
+ self.image_size = image_size
95
+ self.patch_size = patch_size
96
+ self.num_channels = num_channels
97
+ self.is_training = is_training
98
+ self.hidden_size = hidden_size
99
+ self.projection_dim = projection_dim
100
+ self.num_hidden_layers = num_hidden_layers
101
+ self.num_attention_heads = num_attention_heads
102
+ self.intermediate_size = intermediate_size
103
+ self.dropout = dropout
104
+ self.attention_dropout = attention_dropout
105
+ self.initializer_range = initializer_range
106
+ self.scope = scope
107
+
108
+ # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
109
+ num_patches = (image_size // patch_size) ** 2
110
+ self.seq_length = num_patches + 1
111
+
112
+ def prepare_config_and_inputs(self):
113
+ pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
114
+ config = self.get_config()
115
+
116
+ return config, pixel_values
117
+
118
+ def get_config(self):
119
+ return CLIPVisionConfig(
120
+ image_size=self.image_size,
121
+ patch_size=self.patch_size,
122
+ num_channels=self.num_channels,
123
+ hidden_size=self.hidden_size,
124
+ projection_dim=self.projection_dim,
125
+ num_hidden_layers=self.num_hidden_layers,
126
+ num_attention_heads=self.num_attention_heads,
127
+ intermediate_size=self.intermediate_size,
128
+ dropout=self.dropout,
129
+ attention_dropout=self.attention_dropout,
130
+ initializer_range=self.initializer_range,
131
+ )
132
+
133
+ def create_and_check_model(self, config, pixel_values):
134
+ model = CLIPVisionModel(config=config)
135
+ model.to(torch_device)
136
+ model.eval()
137
+ with torch.no_grad():
138
+ result = model(pixel_values)
139
+ # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
140
+ image_size = (self.image_size, self.image_size)
141
+ patch_size = (self.patch_size, self.patch_size)
142
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
143
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
144
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
145
+
146
+ def create_and_check_model_with_projection(self, config, pixel_values):
147
+ model = CLIPVisionModelWithProjection(config=config)
148
+ model.to(torch_device)
149
+ model.eval()
150
+ with torch.no_grad():
151
+ result = model(pixel_values)
152
+ # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
153
+ image_size = (self.image_size, self.image_size)
154
+ patch_size = (self.patch_size, self.patch_size)
155
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
156
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
157
+ self.parent.assertEqual(result.image_embeds.shape, (self.batch_size, self.projection_dim))
158
+
159
+ def prepare_config_and_inputs_for_common(self):
160
+ config_and_inputs = self.prepare_config_and_inputs()
161
+ config, pixel_values = config_and_inputs
162
+ inputs_dict = {"pixel_values": pixel_values}
163
+ return config, inputs_dict
164
+
165
+ @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
166
+ @require_torch_sdpa
167
+ def test_eager_matches_sdpa_inference(self, *args):
168
+ return getattr(ModelTesterMixin, self._testMethodName)(self)
169
+
170
+
171
+ class CLIPModelTesterMixin(ModelTesterMixin):
172
+ """
173
+ Subclass of ModelTesterMixin with methods specific to testing CLIP models.
174
+ The SDPA equivalence test is overridden here because CLIP models may have text/vision/text+vision inputs,
175
+ different output logits, and are not supposed to be used or tested with padding_side="left".
176
+ """
177
+
178
+ @require_torch_sdpa
179
+ def test_sdpa_can_dispatch_composite_models(self):
180
+ for model_class in self.all_model_classes:
181
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
182
+ model = model_class(config)
183
+
184
+ with tempfile.TemporaryDirectory() as tmpdirname:
185
+ model.save_pretrained(tmpdirname)
186
+
187
+ # Load the model with SDPA (it is the default, but we make it explicit for clarity)
188
+ model_sdpa = model_class.from_pretrained(tmpdirname, attn_implementation="sdpa")
189
+ model_sdpa = model_sdpa.eval().to(torch_device)
190
+
191
+ # Load model with eager attention
192
+ model_eager = model_class.from_pretrained(
193
+ tmpdirname,
194
+ attn_implementation="eager",
195
+ )
196
+ model_eager = model_eager.eval().to(torch_device)
197
+
198
+ if hasattr(model_sdpa, "vision_model"):
199
+ self.assertTrue(model_sdpa.vision_model.config._attn_implementation == "sdpa")
200
+ self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
201
+
202
+ if hasattr(model_sdpa, "text_model"):
203
+ self.assertTrue(model_sdpa.text_model.config._attn_implementation == "sdpa")
204
+ self.assertTrue(model_eager.text_model.config._attn_implementation == "eager")
205
+
206
+ self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
207
+ self.assertTrue(model_eager.config._attn_implementation == "eager")
208
+
209
+
210
+ @require_torch
211
+ class CLIPVisionModelTest(CLIPModelTesterMixin, unittest.TestCase):
212
+ """
213
+ Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds,
214
+ attention_mask and seq_length.
215
+ """
216
+
217
+ all_model_classes = (CLIPVisionModel, CLIPVisionModelWithProjection) if is_torch_available() else ()
218
+ fx_compatible = True
219
+ test_pruning = False
220
+ test_resize_embeddings = False
221
+ test_head_masking = False
222
+
223
+ def setUp(self):
224
+ self.model_tester = CLIPVisionModelTester(self)
225
+ self.config_tester = ConfigTester(self, config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=37)
226
+
227
+ def test_config(self):
228
+ self.config_tester.run_common_tests()
229
+
230
+ @unittest.skip(reason="CLIP does not use inputs_embeds")
231
+ def test_inputs_embeds(self):
232
+ pass
233
+
234
+ def test_model_get_set_embeddings(self):
235
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
236
+
237
+ for model_class in self.all_model_classes:
238
+ model = model_class(config)
239
+ self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
240
+ x = model.get_output_embeddings()
241
+ self.assertTrue(x is None or isinstance(x, nn.Linear))
242
+
243
+ def test_forward_signature(self):
244
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
245
+
246
+ for model_class in self.all_model_classes:
247
+ model = model_class(config)
248
+ signature = inspect.signature(model.forward)
249
+ # signature.parameters is an OrderedDict => so arg_names order is deterministic
250
+ arg_names = [*signature.parameters.keys()]
251
+
252
+ expected_arg_names = ["pixel_values"]
253
+ self.assertListEqual(arg_names[:1], expected_arg_names)
254
+
255
+ def test_model(self):
256
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
257
+ self.model_tester.create_and_check_model(*config_and_inputs)
258
+
259
+ def test_model_with_projection(self):
260
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
261
+ self.model_tester.create_and_check_model_with_projection(*config_and_inputs)
262
+
263
+ @unittest.skip
264
+ def test_training(self):
265
+ pass
266
+
267
+ @unittest.skip
268
+ def test_training_gradient_checkpointing(self):
269
+ pass
270
+
271
+ @unittest.skip(
272
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
273
+ )
274
+ def test_training_gradient_checkpointing_use_reentrant(self):
275
+ pass
276
+
277
+ @unittest.skip(
278
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
279
+ )
280
+ def test_training_gradient_checkpointing_use_reentrant_false(self):
281
+ pass
282
+
283
+ @slow
284
+ def test_model_from_pretrained(self):
285
+ model_name = "openai/clip-vit-base-patch32"
286
+ model = CLIPVisionModel.from_pretrained(model_name)
287
+ self.assertIsNotNone(model)
288
+
289
+ @slow
290
+ def test_model_with_projection_from_pretrained(self):
291
+ model_name = "openai/clip-vit-base-patch32"
292
+ model = CLIPVisionModelWithProjection.from_pretrained(model_name)
293
+ self.assertIsNotNone(model)
294
+ self.assertTrue(hasattr(model, "visual_projection"))
295
+
296
+ @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
297
+ @require_torch_sdpa
298
+ @is_flaky()
299
+ def test_eager_matches_sdpa_inference(self, *args):
300
+ # add only the flaky decorator here and call the parent test method
301
+ return getattr(ModelTesterMixin, self._testMethodName)(self)
302
+
303
+ @require_torch_sdpa
304
+ def test_sdpa_can_dispatch_composite_models(self):
305
+ super().test_sdpa_can_dispatch_composite_models()
306
+
307
+
308
+ class CLIPTextModelTester:
309
+ def __init__(
310
+ self,
311
+ parent,
312
+ batch_size=12,
313
+ seq_length=7,
314
+ is_training=True,
315
+ use_input_mask=True,
316
+ use_labels=True,
317
+ vocab_size=99,
318
+ hidden_size=32,
319
+ projection_dim=32,
320
+ num_hidden_layers=2,
321
+ num_attention_heads=4,
322
+ intermediate_size=37,
323
+ dropout=0.1,
324
+ attention_dropout=0.1,
325
+ max_position_embeddings=512,
326
+ initializer_range=0.02,
327
+ scope=None,
328
+ ):
329
+ self.parent = parent
330
+ self.batch_size = batch_size
331
+ self.seq_length = seq_length
332
+ self.is_training = is_training
333
+ self.use_input_mask = use_input_mask
334
+ self.use_labels = use_labels
335
+ self.vocab_size = vocab_size
336
+ self.hidden_size = hidden_size
337
+ self.projection_dim = projection_dim
338
+ self.num_hidden_layers = num_hidden_layers
339
+ self.num_attention_heads = num_attention_heads
340
+ self.intermediate_size = intermediate_size
341
+ self.dropout = dropout
342
+ self.attention_dropout = attention_dropout
343
+ self.max_position_embeddings = max_position_embeddings
344
+ self.initializer_range = initializer_range
345
+ self.scope = scope
346
+
347
+ def prepare_config_and_inputs(self):
348
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
349
+
350
+ input_mask = None
351
+ if self.use_input_mask:
352
+ input_mask = random_attention_mask([self.batch_size, self.seq_length])
353
+
354
+ if input_mask is not None:
355
+ batch_size, seq_length = input_mask.shape
356
+ rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
357
+ for batch_idx, start_index in enumerate(rnd_start_indices):
358
+ input_mask[batch_idx, :start_index] = 1
359
+ input_mask[batch_idx, start_index:] = 0
360
+
361
+ config = self.get_config()
362
+
363
+ return config, input_ids, input_mask
364
+
365
+ def get_config(self):
366
+ return CLIPTextConfig(
367
+ vocab_size=self.vocab_size,
368
+ hidden_size=self.hidden_size,
369
+ projection_dim=self.projection_dim,
370
+ num_hidden_layers=self.num_hidden_layers,
371
+ num_attention_heads=self.num_attention_heads,
372
+ intermediate_size=self.intermediate_size,
373
+ dropout=self.dropout,
374
+ attention_dropout=self.attention_dropout,
375
+ max_position_embeddings=self.max_position_embeddings,
376
+ initializer_range=self.initializer_range,
377
+ )
378
+
379
+ def create_and_check_model(self, config, input_ids, input_mask):
380
+ model = CLIPTextModel(config=config)
381
+ model.to(torch_device)
382
+ model.eval()
383
+ with torch.no_grad():
384
+ result = model(input_ids, attention_mask=input_mask)
385
+ result = model(input_ids)
386
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
387
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
388
+
389
+ def create_and_check_model_with_projection(self, config, input_ids, input_mask):
390
+ model = CLIPTextModelWithProjection(config=config)
391
+ model.to(torch_device)
392
+ model.eval()
393
+ with torch.no_grad():
394
+ result = model(input_ids, attention_mask=input_mask)
395
+ result = model(input_ids)
396
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
397
+ self.parent.assertEqual(result.text_embeds.shape, (self.batch_size, self.projection_dim))
398
+
399
+ def prepare_config_and_inputs_for_common(self):
400
+ config_and_inputs = self.prepare_config_and_inputs()
401
+ config, input_ids, input_mask = config_and_inputs
402
+ inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
403
+ return config, inputs_dict
404
+
405
+
406
+ @require_torch
407
+ class CLIPTextModelTest(CLIPModelTesterMixin, unittest.TestCase):
408
+ all_model_classes = (CLIPTextModel, CLIPTextModelWithProjection) if is_torch_available() else ()
409
+ fx_compatible = True
410
+ test_pruning = False
411
+ test_head_masking = False
412
+ model_split_percents = [0.5, 0.8, 0.9]
413
+
414
+ def setUp(self):
415
+ self.model_tester = CLIPTextModelTester(self)
416
+ self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=37)
417
+
418
+ def test_config(self):
419
+ self.config_tester.run_common_tests()
420
+
421
+ def test_model(self):
422
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
423
+ self.model_tester.create_and_check_model(*config_and_inputs)
424
+
425
+ def test_model_with_projection(self):
426
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
427
+ self.model_tester.create_and_check_model_with_projection(*config_and_inputs)
428
+
429
+ @unittest.skip
430
+ def test_training(self):
431
+ pass
432
+
433
+ @unittest.skip
434
+ def test_training_gradient_checkpointing(self):
435
+ pass
436
+
437
+ @unittest.skip(
438
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
439
+ )
440
+ def test_training_gradient_checkpointing_use_reentrant(self):
441
+ pass
442
+
443
+ @unittest.skip(
444
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
445
+ )
446
+ def test_training_gradient_checkpointing_use_reentrant_false(self):
447
+ pass
448
+
449
+ @unittest.skip(reason="CLIP does not use inputs_embeds")
450
+ def test_inputs_embeds(self):
451
+ pass
452
+
453
+ @slow
454
+ def test_model_from_pretrained(self):
455
+ model_name = "openai/clip-vit-base-patch32"
456
+ model = CLIPTextModel.from_pretrained(model_name)
457
+ self.assertIsNotNone(model)
458
+
459
+ @slow
460
+ def test_model_with_projection_from_pretrained(self):
461
+ model_name = "openai/clip-vit-base-patch32"
462
+ model = CLIPTextModelWithProjection.from_pretrained(model_name)
463
+ self.assertIsNotNone(model)
464
+ self.assertTrue(hasattr(model, "text_projection"))
465
+
466
+ @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
467
+ @require_torch_sdpa
468
+ @slow
469
+ @is_flaky()
470
+ def test_eager_matches_sdpa_inference(self, *args):
471
+ # adding only flaky decorator here and call the parent test method
472
+ return getattr(ModelTesterMixin, self._testMethodName)(self)
473
+
474
+ @require_torch_sdpa
475
+ def test_sdpa_can_dispatch_composite_models(self):
476
+ super().test_sdpa_can_dispatch_composite_models()
477
+
478
+ @require_torch_sdpa
479
+ def test_sdpa_can_dispatch_on_flash(self):
480
+ self.skipTest(reason="CLIPTextModel has two attention masks: `causal_attention_mask` and `attention_mask`")
481
+
482
+
483
+ class CLIPModelTester:
484
+ def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
485
+ if text_kwargs is None:
486
+ text_kwargs = {}
487
+ if vision_kwargs is None:
488
+ vision_kwargs = {}
489
+
490
+ self.parent = parent
491
+ self.text_model_tester = CLIPTextModelTester(parent, **text_kwargs)
492
+ self.vision_model_tester = CLIPVisionModelTester(parent, **vision_kwargs)
493
+ self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
494
+ self.is_training = is_training
495
+
496
+ def prepare_config_and_inputs(self):
497
+ text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
498
+ vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
499
+
500
+ config = self.get_config()
501
+
502
+ return config, input_ids, attention_mask, pixel_values
503
+
504
+ def get_config(self):
505
+ return CLIPConfig.from_text_vision_configs(
506
+ self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
507
+ )
508
+
509
+ def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
510
+ model = CLIPModel(config).to(torch_device).eval()
511
+ with torch.no_grad():
512
+ result = model(input_ids, pixel_values, attention_mask)
513
+ self.parent.assertEqual(
514
+ result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
515
+ )
516
+ self.parent.assertEqual(
517
+ result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
518
+ )
519
+
520
+ def prepare_config_and_inputs_for_common(self):
521
+ config_and_inputs = self.prepare_config_and_inputs()
522
+ config, input_ids, attention_mask, pixel_values = config_and_inputs
523
+ inputs_dict = {
524
+ "input_ids": input_ids,
525
+ "attention_mask": attention_mask,
526
+ "pixel_values": pixel_values,
527
+ "return_loss": True,
528
+ }
529
+ return config, inputs_dict
530
+
531
+
532
+ @require_torch
533
+ class CLIPModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
534
+ all_model_classes = (CLIPModel,) if is_torch_available() else ()
535
+ pipeline_model_mapping = (
536
+ {"feature-extraction": CLIPModel, "image-feature-extraction": CLIPVisionModel} if is_torch_available() else {}
537
+ )
538
+ fx_compatible = True
539
+ test_head_masking = False
540
+ test_pruning = False
541
+ test_resize_embeddings = False
542
+ test_attention_outputs = False
543
+ _is_composite = True
544
+
545
+ def setUp(self):
546
+ self.model_tester = CLIPModelTester(self)
547
+ common_properties = ["projection_dim", "logit_scale_init_value"]
548
+ self.config_tester = ConfigTester(
549
+ self, config_class=CLIPConfig, has_text_modality=False, common_properties=common_properties
550
+ )
551
+
552
+ def test_model(self):
553
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
554
+ self.model_tester.create_and_check_model(*config_and_inputs)
555
+
556
+ def test_config(self):
557
+ self.config_tester.run_common_tests()
558
+
559
+ @unittest.skip(reason="Hidden_states is tested in individual model tests")
560
+ def test_hidden_states_output(self):
561
+ pass
562
+
563
+ @unittest.skip(reason="Inputs_embeds is tested in individual model tests")
564
+ def test_inputs_embeds(self):
565
+ pass
566
+
567
+ @unittest.skip(reason="Retain_grad is tested in individual model tests")
568
+ def test_retain_grad_hidden_states_attentions(self):
569
+ pass
570
+
571
+ @unittest.skip(reason="CLIPModel does not have input/output embeddings")
572
+ def test_model_get_set_embeddings(self):
573
+ pass
574
+
575
+ # override as the `logit_scale` parameter initialization is different for CLIP
576
+ def test_initialization(self):
577
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
578
+
579
+ configs_no_init = _config_zero_init(config)
580
+ for model_class in self.all_model_classes:
581
+ model = model_class(config=configs_no_init)
582
+ for name, param in model.named_parameters():
583
+ if param.requires_grad:
584
+ # check if `logit_scale` is initialized as per the original implementation
585
+ if name == "logit_scale":
586
+ self.assertAlmostEqual(
587
+ param.data.item(),
588
+ np.log(1 / 0.07),
589
+ delta=1e-3,
590
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
591
+ )
592
+ else:
593
+ self.assertIn(
594
+ ((param.data.mean() * 1e9).round() / 1e9).item(),
595
+ [0.0, 1.0],
596
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
597
+ )
598
+
599
+ def _create_and_check_torchscript(self, config, inputs_dict):
600
+ if not self.test_torchscript:
601
+ self.skipTest(reason="test_torchscript is set to False")
602
+
603
+ configs_no_init = _config_zero_init(config) # To be sure we have no Nan
604
+ configs_no_init.torchscript = True
605
+ configs_no_init.return_dict = False
606
+ for model_class in self.all_model_classes:
607
+ model = model_class(config=configs_no_init)
608
+ model.to(torch_device)
609
+ model.eval()
610
+
611
+ try:
612
+ input_ids = inputs_dict["input_ids"]
613
+ pixel_values = inputs_dict["pixel_values"] # CLIP needs pixel_values
614
+ traced_model = torch.jit.trace(model, (input_ids, pixel_values))
615
+ except RuntimeError:
616
+ self.fail("Couldn't trace module.")
617
+
618
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
619
+ pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
620
+
621
+ try:
622
+ torch.jit.save(traced_model, pt_file_name)
623
+ except Exception:
624
+ self.fail("Couldn't save module.")
625
+
626
+ try:
627
+ loaded_model = torch.jit.load(pt_file_name)
628
+ except Exception:
629
+ self.fail("Couldn't load module.")
630
+
631
+ model.to(torch_device)
632
+ model.eval()
633
+
634
+ loaded_model.to(torch_device)
635
+ loaded_model.eval()
636
+
637
+ model_state_dict = model.state_dict()
638
+ loaded_model_state_dict = loaded_model.state_dict()
639
+
640
+ non_persistent_buffers = {}
641
+ for key in loaded_model_state_dict.keys():
642
+ if key not in model_state_dict.keys():
643
+ non_persistent_buffers[key] = loaded_model_state_dict[key]
644
+
645
+ loaded_model_state_dict = {
646
+ key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
647
+ }
648
+
649
+ self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
650
+
651
+ model_buffers = list(model.buffers())
652
+ for non_persistent_buffer in non_persistent_buffers.values():
653
+ found_buffer = False
654
+ for i, model_buffer in enumerate(model_buffers):
655
+ if torch.equal(non_persistent_buffer, model_buffer):
656
+ found_buffer = True
657
+ break
658
+
659
+ self.assertTrue(found_buffer)
660
+ model_buffers.pop(i)
661
+
662
+ models_equal = True
663
+ for layer_name, p1 in model_state_dict.items():
664
+ p2 = loaded_model_state_dict[layer_name]
665
+ if p1.data.ne(p2.data).sum() > 0:
666
+ models_equal = False
667
+
668
+ self.assertTrue(models_equal)
669
+
670
+ def test_load_vision_text_config(self):
671
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
672
+
673
+ # Save CLIPConfig and check if we can load CLIPVisionConfig from it
674
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
675
+ config.save_pretrained(tmp_dir_name)
676
+ vision_config = CLIPVisionConfig.from_pretrained(tmp_dir_name)
677
+ self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict())
678
+
679
+ # Save CLIPConfig and check if we can load CLIPTextConfig from it
680
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
681
+ config.save_pretrained(tmp_dir_name)
682
+ text_config = CLIPTextConfig.from_pretrained(tmp_dir_name)
683
+ self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict())
684
+
685
+ @slow
686
+ def test_model_from_pretrained(self):
687
+ model_name = "openai/clip-vit-base-patch32"
688
+ model = CLIPModel.from_pretrained(model_name)
689
+ self.assertIsNotNone(model)
690
+
691
+ @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
692
+ @require_torch_sdpa
693
+ @slow
694
+ @is_flaky()
695
+ def test_eager_matches_sdpa_inference(self, *args):
696
+ # add only the flaky decorator here and call the parent test method
697
+ return getattr(ModelTesterMixin, self._testMethodName)(self)
698
+
699
+ @require_torch_sdpa
700
+ def test_sdpa_can_dispatch_composite_models(self):
701
+ super().test_sdpa_can_dispatch_composite_models()
702
+
703
+ @require_torch_sdpa
704
+ def test_sdpa_can_dispatch_on_flash(self):
705
+ self.skipTest(reason="CLIP text tower has two attention masks: `causal_attention_mask` and `attention_mask`")
706
+
707
+ @require_torch_sdpa
708
+ def test_sdpa_can_compile_dynamic(self):
709
+ self.skipTest(reason="CLIP model can't be compiled dynamic, error in clip_loss`")
710
+
711
+ @require_flash_attn
712
+ @require_torch_gpu
713
+ @mark.flash_attn_test
714
+ @slow
715
+ def test_flash_attn_2_inference_equivalence(self):
716
+ for model_class in self.all_model_classes:
717
+ if not model_class._supports_flash_attn_2:
718
+ self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
719
+
720
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
721
+ model = model_class(config)
722
+
723
+ with tempfile.TemporaryDirectory() as tmpdirname:
724
+ model.save_pretrained(tmpdirname)
725
+ model_fa = model_class.from_pretrained(
726
+ tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
727
+ )
728
+ model_fa.to(torch_device)
729
+
730
+ model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16)
731
+ model.to(torch_device)
732
+
733
+ dummy_pixel_values = inputs_dict["pixel_values"].to(torch.bfloat16)
734
+ dummy_input_ids = inputs_dict["input_ids"]
735
+
736
+ outputs = model(pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True)
737
+ outputs_fa = model_fa(
738
+ pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True
739
+ )
740
+
741
+ self.assertTrue(
742
+ torch.allclose(outputs.logits_per_image, outputs_fa.logits_per_image, atol=4e-2, rtol=4e-2),
743
+ f"Image logits max diff: {torch.max(torch.abs(outputs.logits_per_image - outputs_fa.logits_per_image))}",
744
+ )
745
+ self.assertTrue(
746
+ torch.allclose(outputs.logits_per_text, outputs_fa.logits_per_text, atol=4e-2, rtol=4e-2),
747
+ f"Text logits max diff: {torch.max(torch.abs(outputs.logits_per_text - outputs_fa.logits_per_text))}",
748
+ )
749
+
750
+ @require_flash_attn
751
+ @require_torch_gpu
752
+ @mark.flash_attn_test
753
+ def test_flash_attn_2_inference_equivalence_right_padding(self):
754
+ for model_class in self.all_model_classes:
755
+ if not model_class._supports_flash_attn_2:
756
+ self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
757
+
758
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
759
+ model = model_class(config)
760
+
761
+ with tempfile.TemporaryDirectory() as tmpdirname:
762
+ model.save_pretrained(tmpdirname)
763
+ model_fa = model_class.from_pretrained(
764
+ tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
765
+ )
766
+ model_fa.to(torch_device)
767
+
768
+ model = model_class.from_pretrained(
769
+ tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="eager"
770
+ )
771
+ model.to(torch_device)
772
+
773
+ dummy_pixel_values = inputs_dict["pixel_values"].to(torch.bfloat16)
774
+ dummy_input_ids = inputs_dict["input_ids"]
775
+ dummy_pixel_mask = inputs_dict["attention_mask"]
776
+
777
+ # right padding
778
+ dummy_pixel_mask[:] = 1
779
+ dummy_pixel_mask[:, -1:] = 0
780
+
781
+ outputs = model(pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True)
782
+ outputs_fa = model_fa(
783
+ pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True
784
+ )
785
+
786
+ logits_per_image_eager = outputs.logits_per_image[:, :-1]
787
+ logits_per_text_eager = outputs.logits_per_text[:, :-1]
788
+
789
+ logits_per_image_sdpa = outputs_fa.logits_per_image[:, :-1]
790
+ logits_per_text_sdpa = outputs_fa.logits_per_text[:, :-1]
791
+
792
+ self.assertTrue(
793
+ torch.allclose(logits_per_image_eager, logits_per_image_sdpa, atol=4e-2, rtol=4e-2),
794
+ f"Image logits max diff: {torch.max(torch.abs(logits_per_image_eager - logits_per_image_sdpa))}",
795
+ )
796
+ self.assertTrue(
797
+ torch.allclose(logits_per_text_eager, logits_per_text_sdpa, atol=4e-2, rtol=4e-2),
798
+ f"Text logits max diff: {torch.max(torch.abs(logits_per_text_eager - logits_per_text_sdpa))}",
799
+ )
800
+
801
+
802
+ class CLIPForImageClassificationModelTester(CLIPModelTester):
803
+ def __init__(self, parent):
804
+ super().__init__(parent)
805
+ self.batch_size = self.vision_model_tester.batch_size
806
+ self.num_hidden_layers = self.vision_model_tester.num_hidden_layers
807
+ self.hidden_size = self.vision_model_tester.hidden_size
808
+ self.seq_length = self.vision_model_tester.seq_length
809
+
810
+ def prepare_config_and_inputs(self):
811
+ _, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
812
+ config = self.get_config()
813
+
814
+ return config, pixel_values
815
+
816
+ def prepare_config_and_inputs_for_common(self):
817
+ config_and_inputs = self.prepare_config_and_inputs()
818
+ config, pixel_values = config_and_inputs
819
+ inputs_dict = {"pixel_values": pixel_values}
820
+ return config, inputs_dict
821
+
822
+
823
+ @require_torch
824
+ class CLIPForImageClassificationModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
825
+ all_model_classes = (CLIPForImageClassification,) if is_torch_available() else ()
826
+ pipeline_model_mapping = {"image-classification": CLIPForImageClassification} if is_torch_available() else {}
827
+ fx_compatible = False
828
+ test_head_masking = False
829
+ test_pruning = False
830
+ test_resize_embeddings = False
831
+ test_attention_outputs = False
832
+ _is_composite = True
833
+
834
+ def setUp(self):
835
+ self.model_tester = CLIPForImageClassificationModelTester(self)
836
+
837
+ @unittest.skip(reason="CLIPForImageClassification does not support inputs_embeds")
838
+ def test_inputs_embeds(self):
839
+ pass
840
+
841
+ @unittest.skip(reason="CLIPForImageClassification does not support inputs_embeds")
842
+ def test_model_get_set_embeddings(self):
843
+ pass
844
+
845
+ @unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet")
846
+ def test_training_gradient_checkpointing(self):
847
+ pass
848
+
849
+ @unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet")
850
+ def test_training_gradient_checkpointing_use_reentrant(self):
851
+ pass
852
+
853
+ @unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet")
854
+ def test_training_gradient_checkpointing_use_reentrant_false(self):
855
+ pass
856
+
857
+ @unittest.skip(reason="CLIP uses the same initialization scheme as the Flax original implementation")
858
+ def test_initialization(self):
859
+ pass
860
+
861
+ @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
862
+ @require_torch_sdpa
863
+ @slow
864
+ @is_flaky()
865
+ def test_eager_matches_sdpa_inference(self, *args):
866
+ # add only the flaky decorator here and call the parent test method
867
+ return getattr(ModelTesterMixin, self._testMethodName)(self)
868
+
869
+ @require_torch_sdpa
870
+ def test_sdpa_can_dispatch_composite_models(self):
871
+ super().test_sdpa_can_dispatch_composite_models()
872
+
873
+
874
+ # We will verify our results on an image of cute cats
875
+ def prepare_img():
876
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
877
+ im = Image.open(requests.get(url, stream=True).raw)
878
+ return im
879
+
880
+
881
+ @require_vision
882
+ @require_torch
883
+ class CLIPModelIntegrationTest(unittest.TestCase):
884
+ @slow
885
+ def test_inference(self):
886
+ model_name = "openai/clip-vit-base-patch32"
887
+ model = CLIPModel.from_pretrained(model_name, attn_implementation="sdpa").to(torch_device)
888
+ processor = CLIPProcessor.from_pretrained(model_name)
889
+
890
+ image = prepare_img()
891
+ inputs = processor(
892
+ text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt"
893
+ ).to(torch_device)
894
+
895
+ # forward pass
896
+ with torch.no_grad():
897
+ outputs = model(**inputs)
898
+
899
+ # verify the logits
900
+ self.assertEqual(
901
+ outputs.logits_per_image.shape,
902
+ torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
903
+ )
904
+ self.assertEqual(
905
+ outputs.logits_per_text.shape,
906
+ torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
907
+ )
908
+
909
+ expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device)
910
+
911
+ torch.testing.assert_close(outputs.logits_per_image, expected_logits, rtol=1e-3, atol=1e-3)
912
+
913
+ @slow
914
+ def test_inference_interpolate_pos_encoding(self):
915
+ # CLIP models have an `interpolate_pos_encoding` argument in their forward method,
916
+ # allowing interpolation of the pre-trained position embeddings in order to use
917
+ # the model on higher resolutions. The DINO model by Facebook AI leverages this
918
+ # to visualize self-attention on higher resolution images.
919
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(torch_device)
920
+
921
+ processor = CLIPProcessor.from_pretrained(
922
+ "openai/clip-vit-base-patch32", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180}
923
+ )
924
+
925
+ image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
926
+ inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
927
+
928
+ # interpolate_pos_encoding=False should raise a ValueError
929
+ with self.assertRaises(ValueError, msg="doesn't match model"):
930
+ with torch.no_grad():
931
+ model(**inputs, interpolate_pos_encoding=False)
932
+
933
+ # forward pass
934
+ with torch.no_grad():
935
+ outputs = model(**inputs, interpolate_pos_encoding=True)
936
+
937
+ # verify the logits
938
+ expected_shape = torch.Size((1, 26, 768))
939
+
940
+ self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
941
+
942
+ expected_slice = torch.tensor(
943
+ [[-0.1538, 0.0322, -0.3235], [0.2893, 0.1135, -0.5708], [0.0461, 0.1540, -0.6018]]
944
+ ).to(torch_device)
945
+
946
+ torch.testing.assert_close(
947
+ outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, rtol=6e-3, atol=4e-4
948
+ )
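The zero-shot image/text matching flow exercised by `CLIPModelIntegrationTest.test_inference` can be reproduced with a short standalone sketch. The checkpoint name, prompts, and image URL are taken from the test itself; the softmax at the end is the usual way to turn the logits into per-prompt probabilities and is not asserted by the test.

# Minimal sketch of the zero-shot matching flow checked in the integration test above.
import requests
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(
    text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt"
)
with torch.no_grad():
    outputs = model(**inputs)

# logits_per_image has shape (num_images, num_texts); softmax over the texts
# gives the probability of each prompt for the image
probs = outputs.logits_per_image.softmax(dim=-1)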
docs/transformers/tests/models/clip/test_modeling_flax_clip.py ADDED
@@ -0,0 +1,468 @@
1
+ import inspect
2
+ import tempfile
3
+ import unittest
4
+
5
+ import numpy as np
6
+
7
+ from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig, is_flax_available
8
+ from transformers.testing_utils import require_flax, slow
9
+
10
+ from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
11
+
12
+
13
+ if is_flax_available():
14
+ import jax
15
+
16
+ from transformers.models.clip.modeling_flax_clip import (
17
+ FlaxCLIPModel,
18
+ FlaxCLIPTextModel,
19
+ FlaxCLIPTextModelWithProjection,
20
+ FlaxCLIPVisionModel,
21
+ )
22
+
23
+
24
+ class FlaxCLIPVisionModelTester:
25
+ def __init__(
26
+ self,
27
+ parent,
28
+ batch_size=12,
29
+ image_size=30,
30
+ patch_size=2,
31
+ num_channels=3,
32
+ is_training=True,
33
+ hidden_size=32,
34
+ num_hidden_layers=2,
35
+ num_attention_heads=4,
36
+ intermediate_size=37,
37
+ dropout=0.1,
38
+ attention_dropout=0.1,
39
+ initializer_range=0.02,
40
+ scope=None,
41
+ ):
42
+ self.parent = parent
43
+ self.batch_size = batch_size
44
+ self.image_size = image_size
45
+ self.patch_size = patch_size
46
+ self.num_channels = num_channels
47
+ self.is_training = is_training
48
+ self.hidden_size = hidden_size
49
+ self.num_hidden_layers = num_hidden_layers
50
+ self.num_attention_heads = num_attention_heads
51
+ self.intermediate_size = intermediate_size
52
+ self.dropout = dropout
53
+ self.attention_dropout = attention_dropout
54
+ self.initializer_range = initializer_range
55
+ self.scope = scope
56
+
57
+ def prepare_config_and_inputs(self):
58
+ pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
59
+ config = CLIPVisionConfig(
60
+ image_size=self.image_size,
61
+ patch_size=self.patch_size,
62
+ num_channels=self.num_channels,
63
+ hidden_size=self.hidden_size,
64
+ num_hidden_layers=self.num_hidden_layers,
65
+ num_attention_heads=self.num_attention_heads,
66
+ intermediate_size=self.intermediate_size,
67
+ dropout=self.dropout,
68
+ attention_dropout=self.attention_dropout,
69
+ initializer_range=self.initializer_range,
70
+ )
71
+
72
+ return config, pixel_values
73
+
74
+ def prepare_config_and_inputs_for_common(self):
75
+ config_and_inputs = self.prepare_config_and_inputs()
76
+ config, pixel_values = config_and_inputs
77
+ inputs_dict = {"pixel_values": pixel_values}
78
+ return config, inputs_dict
79
+
80
+
81
+ @require_flax
82
+ class FlaxCLIPVisionModelTest(FlaxModelTesterMixin, unittest.TestCase):
83
+ """
84
+ Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds,
85
+ attention_mask and seq_length.
86
+ """
87
+
88
+ all_model_classes = (FlaxCLIPVisionModel,) if is_flax_available() else ()
89
+
90
+ def setUp(self):
91
+ self.model_tester = FlaxCLIPVisionModelTester(self)
92
+
93
+ def test_forward_signature(self):
94
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
95
+
96
+ for model_class in self.all_model_classes:
97
+ model = model_class(config)
98
+ signature = inspect.signature(model.__call__)
99
+ # signature.parameters is an OrderedDict => so arg_names order is deterministic
100
+ arg_names = [*signature.parameters.keys()]
101
+
102
+ expected_arg_names = ["pixel_values"]
103
+ self.assertListEqual(arg_names[:1], expected_arg_names)
104
+
105
+ def test_jit_compilation(self):
106
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
107
+
108
+ for model_class in self.all_model_classes:
109
+ with self.subTest(model_class.__name__):
110
+ prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
111
+ model = model_class(config)
112
+
113
+ @jax.jit
114
+ def model_jitted(pixel_values, **kwargs):
115
+ return model(pixel_values=pixel_values, **kwargs).to_tuple()
116
+
117
+ with self.subTest("JIT Enabled"):
118
+ jitted_outputs = model_jitted(**prepared_inputs_dict)
119
+
120
+ with self.subTest("JIT Disabled"):
121
+ with jax.disable_jit():
122
+ outputs = model_jitted(**prepared_inputs_dict)
123
+
124
+ self.assertEqual(len(outputs), len(jitted_outputs))
125
+ for jitted_output, output in zip(jitted_outputs, outputs):
126
+ self.assertEqual(jitted_output.shape, output.shape)
127
+
128
+ def test_hidden_states_output(self):
129
+ def check_hidden_states_output(inputs_dict, config, model_class):
130
+ model = model_class(config)
131
+
132
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class))
133
+ hidden_states = outputs.hidden_states
134
+
135
+ self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
136
+
137
+ # CLIP has a different seq_length
138
+ image_size = (self.model_tester.image_size, self.model_tester.image_size)
139
+ patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
140
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
141
+ seq_length = num_patches + 1
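+ # e.g. with this tester's defaults (image_size=30, patch_size=2): num_patches = 15 * 15 = 225, so seq_length = 226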
142
+
143
+ self.assertListEqual(
144
+ list(hidden_states[0].shape[-2:]),
145
+ [seq_length, self.model_tester.hidden_size],
146
+ )
147
+
148
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
149
+
150
+ for model_class in self.all_model_classes:
151
+ inputs_dict["output_hidden_states"] = True
152
+ check_hidden_states_output(inputs_dict, config, model_class)
153
+
154
+ # check that output_hidden_states also work using config
155
+ del inputs_dict["output_hidden_states"]
156
+ config.output_hidden_states = True
157
+
158
+ check_hidden_states_output(inputs_dict, config, model_class)
159
+
160
+ def test_attention_outputs(self):
161
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
162
+ config.return_dict = True
163
+
164
+ # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
165
+ image_size = (self.model_tester.image_size, self.model_tester.image_size)
166
+ patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
167
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
168
+ seq_length = num_patches + 1
169
+
170
+ for model_class in self.all_model_classes:
171
+ inputs_dict["output_attentions"] = True
172
+ inputs_dict["output_hidden_states"] = False
173
+ model = model_class(config)
174
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class))
175
+ attentions = outputs.attentions
176
+ self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
177
+
178
+ # check that output_attentions also work using config
179
+ del inputs_dict["output_attentions"]
180
+ config.output_attentions = True
181
+ model = model_class(config)
182
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class))
183
+ attentions = outputs.attentions
184
+ self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
185
+
186
+ self.assertListEqual(
187
+ list(attentions[0].shape[-3:]),
188
+ [self.model_tester.num_attention_heads, seq_length, seq_length],
189
+ )
190
+ out_len = len(outputs)
191
+
192
+ # Check attention is always last and order is fine
193
+ inputs_dict["output_attentions"] = True
194
+ inputs_dict["output_hidden_states"] = True
195
+ model = model_class(config)
196
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class))
197
+
198
+ added_hidden_states = 1
199
+ self.assertEqual(out_len + added_hidden_states, len(outputs))
200
+
201
+ self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
202
+ self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
203
+
204
+ self.assertListEqual(
205
+ list(self_attentions[0].shape[-3:]),
206
+ [self.model_tester.num_attention_heads, seq_length, seq_length],
207
+ )
208
+
209
+ # FlaxCLIPVisionModel does not have any base model
210
+ def test_save_load_from_base(self):
211
+ pass
212
+
213
+ # FlaxCLIPVisionModel does not have any base model
214
+ def test_save_load_to_base(self):
215
+ pass
216
+
217
+ @slow
218
+ def test_model_from_pretrained(self):
219
+ for model_class_name in self.all_model_classes:
220
+ model = model_class_name.from_pretrained("openai/clip-vit-base-patch32", from_pt=True)
221
+ outputs = model(np.ones((1, 3, 224, 224)))
222
+ self.assertIsNotNone(outputs)
223
+
224
+
225
+ class FlaxCLIPTextModelTester:
226
+ def __init__(
227
+ self,
228
+ parent,
229
+ batch_size=12,
230
+ seq_length=7,
231
+ is_training=True,
232
+ use_input_mask=True,
233
+ use_labels=True,
234
+ vocab_size=99,
235
+ hidden_size=32,
236
+ num_hidden_layers=2,
237
+ num_attention_heads=4,
238
+ intermediate_size=37,
239
+ dropout=0.1,
240
+ attention_dropout=0.1,
241
+ max_position_embeddings=512,
242
+ initializer_range=0.02,
243
+ scope=None,
244
+ ):
245
+ self.parent = parent
246
+ self.batch_size = batch_size
247
+ self.seq_length = seq_length
248
+ self.is_training = is_training
249
+ self.use_input_mask = use_input_mask
250
+ self.use_labels = use_labels
251
+ self.vocab_size = vocab_size
252
+ self.hidden_size = hidden_size
253
+ self.num_hidden_layers = num_hidden_layers
254
+ self.num_attention_heads = num_attention_heads
255
+ self.intermediate_size = intermediate_size
256
+ self.dropout = dropout
257
+ self.attention_dropout = attention_dropout
258
+ self.max_position_embeddings = max_position_embeddings
259
+ self.initializer_range = initializer_range
260
+ self.scope = scope
261
+
262
+ def prepare_config_and_inputs(self):
263
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
264
+
265
+ input_mask = None
266
+ if self.use_input_mask:
267
+ input_mask = random_attention_mask([self.batch_size, self.seq_length])
268
+
269
+ if input_mask is not None:
270
+ batch_size, seq_length = input_mask.shape
271
+ rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
272
+ for batch_idx, start_index in enumerate(rnd_start_indices):
273
+ input_mask[batch_idx, :start_index] = 1
274
+ input_mask[batch_idx, start_index:] = 0
275
+
276
+ config = CLIPTextConfig(
277
+ vocab_size=self.vocab_size,
278
+ hidden_size=self.hidden_size,
279
+ num_hidden_layers=self.num_hidden_layers,
280
+ num_attention_heads=self.num_attention_heads,
281
+ intermediate_size=self.intermediate_size,
282
+ dropout=self.dropout,
283
+ attention_dropout=self.attention_dropout,
284
+ max_position_embeddings=self.max_position_embeddings,
285
+ initializer_range=self.initializer_range,
286
+ )
287
+
288
+ return config, input_ids, input_mask
289
+
290
+ def prepare_config_and_inputs_for_common(self):
291
+ config_and_inputs = self.prepare_config_and_inputs()
292
+ config, input_ids, input_mask = config_and_inputs
293
+ inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
294
+ return config, inputs_dict
295
+
296
+
297
+ @require_flax
298
+ class FlaxCLIPTextModelTest(FlaxModelTesterMixin, unittest.TestCase):
299
+ all_model_classes = (FlaxCLIPTextModel, FlaxCLIPTextModelWithProjection) if is_flax_available() else ()
300
+
301
+ def setUp(self):
302
+ self.model_tester = FlaxCLIPTextModelTester(self)
303
+
304
+ # FlaxCLIPTextModel does not have any base model
305
+ def test_save_load_from_base(self):
306
+ pass
307
+
308
+ # FlaxCLIPTextModel does not have any base model
309
+ def test_save_load_to_base(self):
310
+ pass
311
+
312
+ @slow
313
+ def test_model_from_pretrained(self):
314
+ for model_class_name in self.all_model_classes:
315
+ model = model_class_name.from_pretrained("openai/clip-vit-base-patch32", from_pt=True)
316
+ outputs = model(np.ones((1, 1)))
317
+ self.assertIsNotNone(outputs)
318
+
319
+
320
+ class FlaxCLIPModelTester:
321
+ def __init__(self, parent, is_training=True):
322
+ self.parent = parent
323
+ self.text_model_tester = FlaxCLIPTextModelTester(parent)
324
+ self.vision_model_tester = FlaxCLIPVisionModelTester(parent)
325
+ self.is_training = is_training
326
+
327
+ def prepare_config_and_inputs(self):
328
+ text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
329
+ vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
330
+
331
+ config = CLIPConfig.from_text_vision_configs(text_config, vision_config, projection_dim=64)
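+ # projection_dim sets the size of the shared embedding space that the text and vision features are projected into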
332
+
333
+ return config, input_ids, attention_mask, pixel_values
334
+
335
+ def prepare_config_and_inputs_for_common(self):
336
+ config_and_inputs = self.prepare_config_and_inputs()
337
+ config, input_ids, attention_mask, pixel_values = config_and_inputs
338
+ inputs_dict = {
339
+ "input_ids": input_ids,
340
+ "attention_mask": attention_mask,
341
+ "pixel_values": pixel_values,
342
+ }
343
+ return config, inputs_dict
344
+
345
+
346
+ @require_flax
347
+ class FlaxCLIPModelTest(FlaxModelTesterMixin, unittest.TestCase):
348
+ all_model_classes = (FlaxCLIPModel,) if is_flax_available() else ()
349
+ test_attention_outputs = False
350
+
351
+ def setUp(self):
352
+ self.model_tester = FlaxCLIPModelTester(self)
353
+
354
+ # hidden_states are tested in individual model tests
355
+ def test_hidden_states_output(self):
356
+ pass
357
+
358
+ def test_jit_compilation(self):
359
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
360
+
361
+ for model_class in self.all_model_classes:
362
+ with self.subTest(model_class.__name__):
363
+ prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
364
+ model = model_class(config)
365
+
366
+ @jax.jit
367
+ def model_jitted(input_ids, pixel_values, **kwargs):
368
+ return model(input_ids=input_ids, pixel_values=pixel_values, **kwargs).to_tuple()
369
+
370
+ with self.subTest("JIT Enabled"):
371
+ jitted_outputs = model_jitted(**prepared_inputs_dict)
372
+
373
+ with self.subTest("JIT Disabled"):
374
+ with jax.disable_jit():
375
+ outputs = model_jitted(**prepared_inputs_dict)
376
+
377
+ self.assertEqual(len(outputs), len(jitted_outputs))
378
+ for jitted_output, output in zip(jitted_outputs[:4], outputs[:4]):
379
+ self.assertEqual(jitted_output.shape, output.shape)
380
+
381
+ def test_forward_signature(self):
382
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
383
+
384
+ for model_class in self.all_model_classes:
385
+ model = model_class(config)
386
+ signature = inspect.signature(model.__call__)
387
+ # signature.parameters is an OrderedDict => so arg_names order is deterministic
388
+ arg_names = [*signature.parameters.keys()]
389
+
390
+ expected_arg_names = ["input_ids", "pixel_values", "attention_mask", "position_ids"]
391
+ self.assertListEqual(arg_names[:4], expected_arg_names)
392
+
393
+ def test_get_image_features(self):
394
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
395
+ model = FlaxCLIPModel(config)
396
+
397
+ @jax.jit
398
+ def model_jitted(pixel_values):
399
+ return model.get_image_features(pixel_values=pixel_values)
400
+
401
+ with self.subTest("JIT Enabled"):
402
+ jitted_output = model_jitted(inputs_dict["pixel_values"])
403
+
404
+ with self.subTest("JIT Disabled"):
405
+ with jax.disable_jit():
406
+ output = model_jitted(inputs_dict["pixel_values"])
407
+
408
+ self.assertEqual(jitted_output.shape, output.shape)
409
+ self.assertTrue(np.allclose(jitted_output, output, atol=1e-3))
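+ # get_image_features returns the projected image embeddings, i.e. shape (batch_size, projection_dim) = (12, 64) with this tester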
410
+
411
+ def test_get_text_features(self):
412
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
413
+ model = FlaxCLIPModel(config)
414
+
415
+ @jax.jit
416
+ def model_jitted(input_ids, attention_mask, **kwargs):
417
+ return model.get_text_features(input_ids=input_ids, attention_mask=attention_mask)
418
+
419
+ with self.subTest("JIT Enabled"):
420
+ jitted_output = model_jitted(**inputs_dict)
421
+
422
+ with self.subTest("JIT Disabled"):
423
+ with jax.disable_jit():
424
+ output = model_jitted(**inputs_dict)
425
+
426
+ self.assertEqual(jitted_output.shape, output.shape)
427
+ self.assertTrue(np.allclose(jitted_output, output, atol=1e-3))
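+ # same check on the text side: get_text_features returns projected text embeddings of shape (batch_size, projection_dim)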
428
+
429
+ @slow
430
+ def test_model_from_pretrained(self):
431
+ for model_class_name in self.all_model_classes:
432
+ model = model_class_name.from_pretrained("openai/clip-vit-base-patch32", from_pt=True)
433
+ outputs = model(input_ids=np.ones((1, 1)), pixel_values=np.ones((1, 3, 224, 224)))
434
+ self.assertIsNotNone(outputs)
435
+
436
+ # overwrite from common since FlaxCLIPModel returns nested output
437
+ # which is not supported in the common test
438
+ def test_from_pretrained_save_pretrained(self):
439
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
440
+
441
+ for model_class in self.all_model_classes:
442
+ if model_class.__name__ != "FlaxCLIPModel":
443
+ continue
444
+
445
+ with self.subTest(model_class.__name__):
446
+ model = model_class(config)
447
+
448
+ prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
449
+ outputs = model(**prepared_inputs_dict).to_tuple()
450
+
451
+ # verify that normal save_pretrained works as expected
452
+ with tempfile.TemporaryDirectory() as tmpdirname:
453
+ model.save_pretrained(tmpdirname)
454
+ model_loaded = model_class.from_pretrained(tmpdirname)
455
+
456
+ outputs_loaded = model_loaded(**prepared_inputs_dict).to_tuple()[:4]
457
+ for output_loaded, output in zip(outputs_loaded, outputs):
458
+ self.assert_almost_equals(output_loaded, output, 1e-3)
459
+
460
+ # verify that save_pretrained for distributed training
461
+ # with `params=params` works as expected
462
+ with tempfile.TemporaryDirectory() as tmpdirname:
463
+ model.save_pretrained(tmpdirname, params=model.params)
464
+ model_loaded = model_class.from_pretrained(tmpdirname)
465
+
466
+ outputs_loaded = model_loaded(**prepared_inputs_dict).to_tuple()[:4]
467
+ for output_loaded, output in zip(outputs_loaded, outputs):
468
+ self.assert_almost_equals(output_loaded, output, 1e-3)
docs/transformers/tests/models/clip/test_modeling_tf_clip.py ADDED
@@ -0,0 +1,662 @@
1
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Testing suite for the TensorFlow CLIP model."""
15
+
16
+ from __future__ import annotations
17
+
18
+ import inspect
19
+ import os
20
+ import tempfile
21
+ import unittest
22
+ from importlib import import_module
23
+
24
+ import requests
25
+
26
+ from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
27
+ from transformers.testing_utils import require_tf, require_vision, slow
28
+ from transformers.utils import is_tf_available, is_vision_available
29
+
30
+ from ...test_configuration_common import ConfigTester
31
+ from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
32
+ from ...test_pipeline_mixin import PipelineTesterMixin
33
+
34
+
35
+ if is_tf_available():
36
+ import tensorflow as tf
37
+
38
+ from transformers import TFCLIPModel, TFCLIPTextModel, TFCLIPVisionModel, TFSharedEmbeddings
39
+ from transformers.modeling_tf_utils import keras
40
+
41
+
42
+ if is_vision_available():
43
+ from PIL import Image
44
+
45
+ from transformers import CLIPProcessor
46
+
47
+
48
+ class TFCLIPVisionModelTester:
49
+ def __init__(
50
+ self,
51
+ parent,
52
+ batch_size=12,
53
+ image_size=30,
54
+ patch_size=2,
55
+ num_channels=3,
56
+ is_training=True,
57
+ hidden_size=32,
58
+ num_hidden_layers=2,
59
+ num_attention_heads=4,
60
+ intermediate_size=37,
61
+ dropout=0.1,
62
+ attention_dropout=0.1,
63
+ initializer_range=0.02,
64
+ scope=None,
65
+ ):
66
+ self.parent = parent
67
+ self.batch_size = batch_size
68
+ self.image_size = image_size
69
+ self.patch_size = patch_size
70
+ self.num_channels = num_channels
71
+ self.is_training = is_training
72
+ self.hidden_size = hidden_size
73
+ self.num_hidden_layers = num_hidden_layers
74
+ self.num_attention_heads = num_attention_heads
75
+ self.intermediate_size = intermediate_size
76
+ self.dropout = dropout
77
+ self.attention_dropout = attention_dropout
78
+ self.initializer_range = initializer_range
79
+ self.scope = scope
80
+
81
+ def prepare_config_and_inputs(self):
82
+ pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
83
+ config = self.get_config()
84
+
85
+ return config, pixel_values
86
+
87
+ def get_config(self):
88
+ return CLIPVisionConfig(
89
+ image_size=self.image_size,
90
+ patch_size=self.patch_size,
91
+ num_channels=self.num_channels,
92
+ hidden_size=self.hidden_size,
93
+ num_hidden_layers=self.num_hidden_layers,
94
+ num_attention_heads=self.num_attention_heads,
95
+ intermediate_size=self.intermediate_size,
96
+ dropout=self.dropout,
97
+ attention_dropout=self.attention_dropout,
98
+ initializer_range=self.initializer_range,
99
+ )
100
+
101
+ def create_and_check_model(self, config, pixel_values):
102
+ model = TFCLIPVisionModel(config=config)
103
+ result = model(pixel_values, training=False)
104
+ # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
105
+ image_size = (self.image_size, self.image_size)
106
+ patch_size = (self.patch_size, self.patch_size)
107
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
108
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
109
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
110
+
111
+ def prepare_config_and_inputs_for_common(self):
112
+ config_and_inputs = self.prepare_config_and_inputs()
113
+ config, pixel_values = config_and_inputs
114
+ inputs_dict = {"pixel_values": pixel_values}
115
+ return config, inputs_dict
116
+
117
+
118
+ @require_tf
119
+ class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase):
120
+ """
121
+ Here we also overwrite some of the tests of test_modeling_tf_common.py, as CLIP does not use input_ids, inputs_embeds,
122
+ attention_mask and seq_length.
123
+ """
124
+
125
+ all_model_classes = (TFCLIPVisionModel,) if is_tf_available() else ()
126
+
127
+ test_pruning = False
128
+ test_resize_embeddings = False
129
+ test_head_masking = False
130
+ test_onnx = False
131
+
132
+ def setUp(self):
133
+ self.model_tester = TFCLIPVisionModelTester(self)
134
+ self.config_tester = ConfigTester(self, config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=37)
135
+
136
+ def test_config(self):
137
+ self.config_tester.run_common_tests()
138
+
139
+ def test_inputs_embeds(self):
140
+ # CLIP does not use inputs_embeds
141
+ pass
142
+
143
+ def test_graph_mode_with_inputs_embeds(self):
144
+ # CLIP does not use inputs_embeds
145
+ pass
146
+
147
+ def test_model_common_attributes(self):
148
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
149
+
150
+ for model_class in self.all_model_classes:
151
+ model = model_class(config)
152
+ self.assertIsInstance(model.get_input_embeddings(), (keras.layers.Layer))
153
+ x = model.get_output_embeddings()
154
+ self.assertTrue(x is None or isinstance(x, keras.layers.Layer))
155
+
156
+ def test_forward_signature(self):
157
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
158
+
159
+ for model_class in self.all_model_classes:
160
+ model = model_class(config)
161
+ signature = inspect.signature(model.call)
162
+ # signature.parameters is an OrderedDict => so arg_names order is deterministic
163
+ arg_names = [*signature.parameters.keys()]
164
+
165
+ expected_arg_names = ["pixel_values"]
166
+ self.assertListEqual(arg_names[:1], expected_arg_names)
167
+
168
+ def test_model(self):
169
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
170
+ self.model_tester.create_and_check_model(*config_and_inputs)
171
+
172
+ def test_attention_outputs(self):
173
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
174
+ config.return_dict = True
175
+
176
+ # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
177
+ image_size = (self.model_tester.image_size, self.model_tester.image_size)
178
+ patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
179
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
180
+ seq_len = num_patches + 1
181
+
182
+ for model_class in self.all_model_classes:
183
+ inputs_dict["output_attentions"] = True
184
+ inputs_dict["output_hidden_states"] = False
185
+ config.return_dict = True
186
+ model = model_class(config)
187
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
188
+ attentions = outputs.attentions
189
+ self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
190
+
191
+ # check that output_attentions also work using config
192
+ del inputs_dict["output_attentions"]
193
+ config.output_attentions = True
194
+ model = model_class(config)
195
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
196
+ attentions = outputs.attentions
197
+ self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
198
+
199
+ out_len = len(outputs)
200
+
201
+ # Check attention is always last and order is fine
202
+ inputs_dict["output_attentions"] = True
203
+ inputs_dict["output_hidden_states"] = True
204
+ model = model_class(config)
205
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
206
+
207
+ added_hidden_states = 1
208
+ self.assertEqual(out_len + added_hidden_states, len(outputs))
209
+
210
+ self_attentions = outputs.attentions
211
+
212
+ self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
213
+
214
+ self.assertListEqual(
215
+ list(self_attentions[0].shape[-3:]),
216
+ [self.model_tester.num_attention_heads, seq_len, seq_len],
217
+ )
218
+
219
+ def test_hidden_states_output(self):
220
+ def check_hidden_states_output(inputs_dict, config, model_class):
221
+ model = model_class(config)
222
+
223
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
224
+
225
+ hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
226
+
227
+ expected_num_layers = getattr(
228
+ self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
229
+ )
230
+ self.assertEqual(len(hidden_states), expected_num_layers)
231
+
232
+ # CLIP has a different seq_length
233
+ image_size = (self.model_tester.image_size, self.model_tester.image_size)
234
+ patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
235
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
236
+ seq_length = num_patches + 1
237
+
238
+ self.assertListEqual(
239
+ list(hidden_states[0].shape[-2:]),
240
+ [seq_length, self.model_tester.hidden_size],
241
+ )
242
+
243
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
244
+
245
+ for model_class in self.all_model_classes:
246
+ inputs_dict["output_hidden_states"] = True
247
+ check_hidden_states_output(inputs_dict, config, model_class)
248
+
249
+ # check that output_hidden_states also work using config
250
+ del inputs_dict["output_hidden_states"]
251
+ config.output_hidden_states = True
252
+
253
+ check_hidden_states_output(inputs_dict, config, model_class)
254
+
255
+ @slow
256
+ def test_model_from_pretrained(self):
257
+ model_name = "openai/clip-vit-base-patch32"
258
+ model = TFCLIPVisionModel.from_pretrained(model_name)
259
+ self.assertIsNotNone(model)
260
+
261
+ @slow
262
+ def test_saved_model_creation_extended(self):
263
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
264
+ config.output_hidden_states = True
265
+ config.output_attentions = True
266
+
267
+ if hasattr(config, "use_cache"):
268
+ config.use_cache = True
269
+
270
+ # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
271
+ image_size = (self.model_tester.image_size, self.model_tester.image_size)
272
+ patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
273
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
274
+ seq_len = num_patches + 1
275
+
276
+ for model_class in self.all_model_classes:
277
+ class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
278
+ model = model_class(config)
279
+ num_out = len(model(class_inputs_dict))
280
+
281
+ with tempfile.TemporaryDirectory() as tmpdirname:
282
+ model.save_pretrained(tmpdirname, saved_model=True)
283
+ saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
284
+ model = keras.models.load_model(saved_model_dir)
285
+ outputs = model(class_inputs_dict)
286
+ output_hidden_states = outputs["hidden_states"]
287
+ output_attentions = outputs["attentions"]
288
+
289
+ # Check num outputs
290
+ self.assertEqual(len(outputs), num_out)
291
+
292
+ # Check num layers
293
+ expected_num_layers = getattr(
294
+ self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
295
+ )
296
+
297
+ self.assertEqual(len(output_hidden_states), expected_num_layers)
298
+ self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
299
+
300
+ # Check attention outputs
301
+ image_size = (self.model_tester.image_size, self.model_tester.image_size)
302
+ patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
303
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
304
+ seq_len = num_patches + 1
305
+
306
+ self.assertListEqual(
307
+ list(output_attentions[0].shape[-3:]),
308
+ [self.model_tester.num_attention_heads, seq_len, seq_len],
309
+ )
310
+
311
+ # Check hidden states
312
+ self.assertListEqual(
313
+ list(output_hidden_states[0].shape[-2:]),
314
+ [seq_len, self.model_tester.hidden_size],
315
+ )
316
+
317
+
318
+ class TFCLIPTextModelTester:
319
+ def __init__(
320
+ self,
321
+ parent,
322
+ batch_size=12,
323
+ seq_length=7,
324
+ is_training=True,
325
+ use_input_mask=True,
326
+ use_labels=True,
327
+ vocab_size=99,
328
+ hidden_size=32,
329
+ num_hidden_layers=2,
330
+ num_attention_heads=4,
331
+ intermediate_size=37,
332
+ dropout=0.1,
333
+ attention_dropout=0.1,
334
+ max_position_embeddings=512,
335
+ initializer_range=0.02,
336
+ scope=None,
337
+ ):
338
+ self.parent = parent
339
+ self.batch_size = batch_size
340
+ self.seq_length = seq_length
341
+ self.is_training = is_training
342
+ self.use_input_mask = use_input_mask
343
+ self.use_labels = use_labels
344
+ self.vocab_size = vocab_size
345
+ self.hidden_size = hidden_size
346
+ self.num_hidden_layers = num_hidden_layers
347
+ self.num_attention_heads = num_attention_heads
348
+ self.intermediate_size = intermediate_size
349
+ self.dropout = dropout
350
+ self.attention_dropout = attention_dropout
351
+ self.max_position_embeddings = max_position_embeddings
352
+ self.initializer_range = initializer_range
353
+ self.scope = scope
354
+
355
+ def prepare_config_and_inputs(self):
356
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
357
+
358
+ input_mask = None
359
+ if self.use_input_mask:
360
+ input_mask = random_attention_mask([self.batch_size, self.seq_length])
361
+ # make sure the first token has attention mask `1` to ensure that, after combining the causal mask, there
362
+ # is still at least one token being attended to for each batch.
363
+ # TODO: Change `random_attention_mask` in PT/TF/Flax common test file, after a discussion with the team.
364
+ input_mask = tf.concat(
365
+ [tf.ones_like(input_mask[:, :1], dtype=input_mask.dtype), input_mask[:, 1:]], axis=-1
366
+ )
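+ # e.g. a sampled mask row [0, 1, 1, 0, 1, 0, 0] becomes [1, 1, 1, 0, 1, 0, 0]: only the first position is forced to 1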
367
+
368
+ config = self.get_config()
369
+
370
+ return config, input_ids, input_mask
371
+
372
+ def get_config(self):
373
+ return CLIPTextConfig(
374
+ vocab_size=self.vocab_size,
375
+ hidden_size=self.hidden_size,
376
+ num_hidden_layers=self.num_hidden_layers,
377
+ num_attention_heads=self.num_attention_heads,
378
+ intermediate_size=self.intermediate_size,
379
+ dropout=self.dropout,
380
+ attention_dropout=self.attention_dropout,
381
+ max_position_embeddings=self.max_position_embeddings,
382
+ initializer_range=self.initializer_range,
383
+ )
384
+
385
+ def create_and_check_model(self, config, input_ids, input_mask):
386
+ model = TFCLIPTextModel(config=config)
387
+ result = model(input_ids, attention_mask=input_mask, training=False)
388
+ result = model(input_ids, training=False)
389
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
390
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
391
+
392
+ def prepare_config_and_inputs_for_common(self):
393
+ config_and_inputs = self.prepare_config_and_inputs()
394
+ config, input_ids, input_mask = config_and_inputs
395
+ inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
396
+ return config, inputs_dict
397
+
398
+
399
+ @require_tf
400
+ class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase):
401
+ all_model_classes = (TFCLIPTextModel,) if is_tf_available() else ()
402
+ test_pruning = False
403
+ test_head_masking = False
404
+ test_onnx = False
405
+
406
+ def setUp(self):
407
+ self.model_tester = TFCLIPTextModelTester(self)
408
+ self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=37)
409
+
410
+ def test_config(self):
411
+ self.config_tester.run_common_tests()
412
+
413
+ def test_model(self):
414
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
415
+ self.model_tester.create_and_check_model(*config_and_inputs)
416
+
417
+ def test_inputs_embeds(self):
418
+ # CLIP does not use inputs_embeds
419
+ pass
420
+
421
+ @slow
422
+ def test_model_from_pretrained(self):
423
+ model_name = "openai/clip-vit-base-patch32"
424
+ model = TFCLIPTextModel.from_pretrained(model_name)
425
+ self.assertIsNotNone(model)
426
+
427
+ @slow
428
+ def test_saved_model_creation_extended(self):
429
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
430
+ config.output_hidden_states = True
431
+ config.output_attentions = True
432
+
433
+ if hasattr(config, "use_cache"):
434
+ config.use_cache = True
435
+
436
+ for model_class in self.all_model_classes:
437
+ class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
438
+ model = model_class(config)
439
+ num_out = len(model(class_inputs_dict))
440
+
441
+ with tempfile.TemporaryDirectory() as tmpdirname:
442
+ model.save_pretrained(tmpdirname, saved_model=True)
443
+ saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
444
+ model = keras.models.load_model(saved_model_dir)
445
+ outputs = model(class_inputs_dict)
446
+ output_hidden_states = outputs["hidden_states"]
447
+ output_attentions = outputs["attentions"]
448
+
449
+ # Check number of outputs
450
+ self.assertEqual(len(outputs), num_out)
451
+
452
+ # Check number of layers
453
+ expected_num_layers = getattr(
454
+ self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
455
+ )
456
+
457
+ # Check hidden states
458
+ self.assertEqual(len(output_hidden_states), expected_num_layers)
459
+ self.assertListEqual(
460
+ list(output_hidden_states[0].shape[-2:]),
461
+ [self.model_tester.seq_length, self.model_tester.hidden_size],
462
+ )
463
+
464
+ # Check attention outputs
465
+ self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
466
+
467
+ seq_length = self.model_tester.seq_length
468
+ key_length = getattr(self.model_tester, "key_length", seq_length)
469
+
470
+ self.assertListEqual(
471
+ list(output_attentions[0].shape[-3:]),
472
+ [self.model_tester.num_attention_heads, seq_length, key_length],
473
+ )
474
+
475
+
476
+ class TFCLIPModelTester:
477
+ def __init__(self, parent, is_training=True):
478
+ self.parent = parent
479
+ self.text_model_tester = TFCLIPTextModelTester(parent)
480
+ self.vision_model_tester = TFCLIPVisionModelTester(parent)
481
+ self.is_training = is_training
482
+
483
+ def prepare_config_and_inputs(self):
484
+ text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
485
+ vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
486
+
487
+ config = self.get_config()
488
+
489
+ return config, input_ids, attention_mask, pixel_values
490
+
491
+ def get_config(self):
492
+ return CLIPConfig.from_text_vision_configs(
493
+ self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
494
+ )
495
+
496
+ def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
497
+ model = TFCLIPModel(config)
498
+ result = model(input_ids, pixel_values, attention_mask, training=False)
499
+ self.parent.assertEqual(
500
+ result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
501
+ )
502
+ self.parent.assertEqual(
503
+ result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
504
+ )
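+ # logits_per_image has shape (image batch, text batch) and logits_per_text is its transpose, hence the swapped dimensions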
505
+
506
+ def prepare_config_and_inputs_for_common(self):
507
+ config_and_inputs = self.prepare_config_and_inputs()
508
+ config, input_ids, attention_mask, pixel_values = config_and_inputs
509
+ inputs_dict = {
510
+ "input_ids": input_ids,
511
+ "attention_mask": attention_mask,
512
+ "pixel_values": pixel_values,
513
+ "return_loss": True,
514
+ }
515
+ return config, inputs_dict
516
+
517
+
518
+ @require_tf
519
+ class TFCLIPModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
520
+ all_model_classes = (TFCLIPModel,) if is_tf_available() else ()
521
+ pipeline_model_mapping = {"feature-extraction": TFCLIPModel} if is_tf_available() else {}
522
+ test_head_masking = False
523
+ test_pruning = False
524
+ test_resize_embeddings = False
525
+ test_attention_outputs = False
526
+ test_onnx = False
527
+
528
+ def setUp(self):
529
+ self.model_tester = TFCLIPModelTester(self)
530
+
531
+ def test_model(self):
532
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
533
+ self.model_tester.create_and_check_model(*config_and_inputs)
534
+
535
+ # hidden_states are tested in individual model tests
536
+ def test_hidden_states_output(self):
537
+ pass
538
+
539
+ # input_embeds are tested in individual model tests
540
+ def test_inputs_embeds(self):
541
+ pass
542
+
543
+ # CLIPModel does not have input/output embeddings
544
+ def test_model_common_attributes(self):
545
+ pass
546
+
547
+ # overwrite from common since `TFCLIPModelTester` sets `return_loss` to `True`, which makes the preparation of
548
+ # `symbolic_inputs` fail.
549
+ def test_keras_save_load(self):
550
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
551
+
552
+ # remove `return_loss` to make code work
553
+ if self.__class__.__name__ == "TFCLIPModelTest":
554
+ inputs_dict.pop("return_loss", None)
555
+
556
+ tf_main_layer_classes = {
557
+ module_member
558
+ for model_class in self.all_model_classes
559
+ for module in (import_module(model_class.__module__),)
560
+ for module_member_name in dir(module)
561
+ if module_member_name.endswith("MainLayer")
562
+ # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
563
+ and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
564
+ for module_member in (getattr(module, module_member_name),)
565
+ if isinstance(module_member, type)
566
+ and keras.layers.Layer in module_member.__bases__
567
+ and getattr(module_member, "_keras_serializable", False)
568
+ }
569
+ for main_layer_class in tf_main_layer_classes:
570
+ # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
571
+ if "T5" in main_layer_class.__name__:
572
+ # Take the same values as in TFT5ModelTester for this shared layer
573
+ shared = TFSharedEmbeddings(99, 32, name="shared")
574
+ config.use_cache = inputs_dict.pop("use_cache", None)
575
+ main_layer = main_layer_class(config, embed_tokens=shared)
576
+ else:
577
+ main_layer = main_layer_class(config)
578
+
579
+ symbolic_inputs = {
580
+ name: keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
581
+ }
582
+
583
+ model = keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
584
+ outputs = model(inputs_dict)
585
+
586
+ with tempfile.TemporaryDirectory() as tmpdirname:
587
+ filepath = os.path.join(tmpdirname, "keras_model.h5")
588
+ model.save(filepath)
589
+ if "T5" in main_layer_class.__name__:
590
+ model = keras.models.load_model(
591
+ filepath,
592
+ custom_objects={
593
+ main_layer_class.__name__: main_layer_class,
594
+ "TFSharedEmbeddings": TFSharedEmbeddings,
595
+ },
596
+ )
597
+ else:
598
+ model = keras.models.load_model(
599
+ filepath, custom_objects={main_layer_class.__name__: main_layer_class}
600
+ )
601
+ assert isinstance(model, keras.Model)
602
+ after_outputs = model(inputs_dict)
603
+ self.assert_outputs_same(after_outputs, outputs)
604
+
605
+ @slow
606
+ def test_model_from_pretrained(self):
607
+ model_name = "openai/clip-vit-base-patch32"
608
+ model = TFCLIPModel.from_pretrained(model_name)
609
+ self.assertIsNotNone(model)
610
+
611
+ @unittest.skip(reason="Currently `saved_model` doesn't work with nested outputs.")
612
+ @slow
613
+ def test_saved_model_creation(self):
614
+ pass
615
+
616
+ @unittest.skip(reason="Currently `saved_model` doesn't work with nested outputs.")
617
+ @slow
618
+ def test_saved_model_creation_extended(self):
619
+ pass
620
+
621
+ @unittest.skip(reason="`saved_model` doesn't work with nested outputs so no preparation happens.")
622
+ @slow
623
+ def test_prepare_serving_output(self):
624
+ pass
625
+
626
+
627
+ # We will verify our results on an image of cute cats
628
+ def prepare_img():
629
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
630
+ im = Image.open(requests.get(url, stream=True).raw)
631
+ return im
632
+
633
+
634
+ @require_vision
635
+ @require_tf
636
+ class TFCLIPModelIntegrationTest(unittest.TestCase):
637
+ @slow
638
+ def test_inference(self):
639
+ model_name = "openai/clip-vit-base-patch32"
640
+ model = TFCLIPModel.from_pretrained(model_name)
641
+ processor = CLIPProcessor.from_pretrained(model_name)
642
+
643
+ image = prepare_img()
644
+ inputs = processor(
645
+ text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="tf"
646
+ )
647
+
648
+ outputs = model(**inputs, training=False)
649
+
650
+ # verify the logits
651
+ self.assertEqual(
652
+ outputs.logits_per_image.shape,
653
+ tf.TensorShape((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
654
+ )
655
+ self.assertEqual(
656
+ outputs.logits_per_text.shape,
657
+ tf.TensorShape((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
658
+ )
659
+
660
+ expected_logits = tf.constant([[24.5701, 19.3049]])
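+ # the first caption ("a photo of a cat") is expected to score higher than the second on the COCO cats image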
661
+
662
+ tf.debugging.assert_near(outputs.logits_per_image, expected_logits, atol=1e-3)
docs/transformers/tests/models/clip/test_processor_clip.py ADDED
@@ -0,0 +1,199 @@
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ import os
17
+ import shutil
18
+ import tempfile
19
+ import unittest
20
+
21
+ import pytest
22
+
23
+ from transformers import CLIPTokenizer, CLIPTokenizerFast
24
+ from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
25
+ from transformers.testing_utils import require_vision
26
+ from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
27
+
28
+ from ...test_processing_common import ProcessorTesterMixin
29
+
30
+
31
+ if is_vision_available():
32
+ from transformers import CLIPImageProcessor, CLIPProcessor
33
+
34
+
35
+ @require_vision
36
+ class CLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase):
37
+ processor_class = CLIPProcessor
38
+
39
+ @classmethod
40
+ def setUpClass(cls):
41
+ cls.tmpdirname = tempfile.mkdtemp()
42
+
43
+ vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"] # fmt: skip
44
+ vocab_tokens = dict(zip(vocab, range(len(vocab))))
45
+ merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>", ""]
46
+ cls.special_tokens_map = {"unk_token": "<unk>"}
47
+
48
+ cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
49
+ cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
50
+ with open(cls.vocab_file, "w", encoding="utf-8") as fp:
51
+ fp.write(json.dumps(vocab_tokens) + "\n")
52
+ with open(cls.merges_file, "w", encoding="utf-8") as fp:
53
+ fp.write("\n".join(merges))
54
+
55
+ image_processor_map = {
56
+ "do_resize": True,
57
+ "size": 20,
58
+ "do_center_crop": True,
59
+ "crop_size": 18,
60
+ "do_normalize": True,
61
+ "image_mean": [0.48145466, 0.4578275, 0.40821073],
62
+ "image_std": [0.26862954, 0.26130258, 0.27577711],
63
+ }
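+ # image_mean / image_std above are the standard CLIP normalization statistics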
64
+ cls.image_processor_file = os.path.join(cls.tmpdirname, IMAGE_PROCESSOR_NAME)
65
+ with open(cls.image_processor_file, "w", encoding="utf-8") as fp:
66
+ json.dump(image_processor_map, fp)
67
+
68
+ @classmethod
69
+ def get_tokenizer(cls, **kwargs):
70
+ return CLIPTokenizer.from_pretrained(cls.tmpdirname, **kwargs)
71
+
72
+ @classmethod
73
+ def get_rust_tokenizer(cls, **kwargs):
74
+ return CLIPTokenizerFast.from_pretrained(cls.tmpdirname, **kwargs)
75
+
76
+ @classmethod
77
+ def get_image_processor(cls, **kwargs):
78
+ return CLIPImageProcessor.from_pretrained(cls.tmpdirname, **kwargs)
79
+
80
+ @classmethod
81
+ def tearDownClass(cls):
82
+ shutil.rmtree(cls.tmpdirname)
83
+
84
+ def test_save_load_pretrained_default(self):
85
+ tokenizer_slow = self.get_tokenizer()
86
+ tokenizer_fast = self.get_rust_tokenizer()
87
+ image_processor = self.get_image_processor()
88
+
89
+ with tempfile.TemporaryDirectory() as tmpdir:
90
+ processor_slow = CLIPProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
91
+ processor_slow.save_pretrained(tmpdir)
92
+ processor_slow = CLIPProcessor.from_pretrained(tmpdir, use_fast=False)
93
+
94
+ processor_fast = CLIPProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
95
+ processor_fast.save_pretrained(tmpdir)
96
+ processor_fast = CLIPProcessor.from_pretrained(tmpdir)
97
+
98
+ self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab())
99
+ self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab())
100
+ self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab())
101
+ self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
102
+ self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
103
+
104
+ self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
105
+ self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
106
+ self.assertIsInstance(processor_slow.image_processor, CLIPImageProcessor)
107
+ self.assertIsInstance(processor_fast.image_processor, CLIPImageProcessor)
108
+
109
+ def test_save_load_pretrained_additional_features(self):
110
+ with tempfile.TemporaryDirectory() as tmpdir:
111
+ processor = CLIPProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
112
+ processor.save_pretrained(tmpdir)
113
+
114
+ tokenizer_add_kwargs = CLIPTokenizer.from_pretrained(tmpdir, bos_token="(BOS)", eos_token="(EOS)")
115
+ image_processor_add_kwargs = CLIPImageProcessor.from_pretrained(
116
+ tmpdir, do_normalize=False, padding_value=1.0
117
+ )
118
+
119
+ processor = CLIPProcessor.from_pretrained(
120
+ tmpdir, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
121
+ )
122
+
123
+ self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
124
+ self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
125
+
126
+ self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
127
+ self.assertIsInstance(processor.image_processor, CLIPImageProcessor)
128
+
129
+ def test_image_processor(self):
130
+ image_processor = self.get_image_processor()
131
+ tokenizer = self.get_tokenizer()
132
+
133
+ processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
134
+
135
+ image_input = self.prepare_image_inputs()
136
+
137
+ input_image_proc = image_processor(image_input, return_tensors="np")
138
+ input_processor = processor(images=image_input, return_tensors="np")
139
+
140
+ for key in input_image_proc.keys():
141
+ self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
142
+
143
+ def test_tokenizer(self):
144
+ image_processor = self.get_image_processor()
145
+ tokenizer = self.get_tokenizer()
146
+
147
+ processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
148
+
149
+ input_str = "lower newer"
150
+
151
+ encoded_processor = processor(text=input_str)
152
+
153
+ encoded_tok = tokenizer(input_str)
154
+
155
+ for key in encoded_tok.keys():
156
+ self.assertListEqual(encoded_tok[key], encoded_processor[key])
157
+
158
+ def test_processor(self):
159
+ image_processor = self.get_image_processor()
160
+ tokenizer = self.get_tokenizer()
161
+
162
+ processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
163
+
164
+ input_str = "lower newer"
165
+ image_input = self.prepare_image_inputs()
166
+
167
+ inputs = processor(text=input_str, images=image_input)
168
+
169
+ self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"])
170
+
171
+ # test if it raises when no input is passed
172
+ with pytest.raises(ValueError):
173
+ processor()
174
+
175
+ def test_tokenizer_decode(self):
176
+ image_processor = self.get_image_processor()
177
+ tokenizer = self.get_tokenizer()
178
+
179
+ processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
180
+
181
+ predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
182
+
183
+ decoded_processor = processor.batch_decode(predicted_ids)
184
+ decoded_tok = tokenizer.batch_decode(predicted_ids)
185
+
186
+ self.assertListEqual(decoded_tok, decoded_processor)
187
+
188
+ def test_model_input_names(self):
189
+ image_processor = self.get_image_processor()
190
+ tokenizer = self.get_tokenizer()
191
+
192
+ processor = CLIPProcessor(tokenizer=tokenizer, image_processor=image_processor)
193
+
194
+ input_str = "lower newer"
195
+ image_input = self.prepare_image_inputs()
196
+
197
+ inputs = processor(text=input_str, images=image_input)
198
+
199
+ self.assertListEqual(list(inputs.keys()), processor.model_input_names)
docs/transformers/tests/models/clip/test_tokenization_clip.py ADDED
@@ -0,0 +1,192 @@
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import json
17
+ import os
18
+ import unittest
19
+ from functools import lru_cache
20
+
21
+ from transformers import CLIPTokenizer, CLIPTokenizerFast
22
+ from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
23
+ from transformers.testing_utils import require_ftfy, require_tokenizers
24
+
25
+ from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
26
+
27
+
28
+ @require_tokenizers
29
+ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
30
+ from_pretrained_id = "openai/clip-vit-base-patch32"
31
+ tokenizer_class = CLIPTokenizer
32
+ rust_tokenizer_class = CLIPTokenizerFast
33
+ test_rust_tokenizer = True
34
+ from_pretrained_kwargs = {}
35
+ test_seq2seq = False
36
+
37
+ @classmethod
38
+ def setUpClass(cls):
39
+ super().setUpClass()
40
+
41
+ vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"] # fmt: skip
42
+ vocab_tokens = dict(zip(vocab, range(len(vocab))))
43
+ merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>"]
44
+ cls.special_tokens_map = {"unk_token": "<unk>"}
45
+
46
+ cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
47
+ cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
48
+ with open(cls.vocab_file, "w", encoding="utf-8") as fp:
49
+ fp.write(json.dumps(vocab_tokens) + "\n")
50
+ with open(cls.merges_file, "w", encoding="utf-8") as fp:
51
+ fp.write("\n".join(merges))
52
+
53
+ @classmethod
54
+ @use_cache_if_possible
55
+ @lru_cache(maxsize=64)
56
+ def get_tokenizer(cls, pretrained_name=None, **kwargs):
57
+ kwargs.update(cls.special_tokens_map)
58
+ pretrained_name = pretrained_name or cls.tmpdirname
59
+ return CLIPTokenizer.from_pretrained(pretrained_name, **kwargs)
60
+
61
+ @classmethod
62
+ @use_cache_if_possible
63
+ @lru_cache(maxsize=64)
64
+ def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
65
+ kwargs.update(cls.special_tokens_map)
66
+ pretrained_name = pretrained_name or cls.tmpdirname
67
+ return CLIPTokenizerFast.from_pretrained(pretrained_name, **kwargs)
68
+
69
+ def get_input_output_texts(self, tokenizer):
70
+ input_text = "lower newer"
71
+ output_text = "lower newer"
72
+ return input_text, output_text
73
+
74
+ def test_full_tokenizer(self):
75
+ tokenizer = CLIPTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
76
+ text = "lower newer"
77
+ bpe_tokens = ["lo", "w", "er</w>", "n", "e", "w", "er</w>"]
78
+ tokens = tokenizer.tokenize(text)
79
+ self.assertListEqual(tokens, bpe_tokens)
80
+
81
+ input_tokens = tokens + [tokenizer.unk_token]
82
+ input_bpe_tokens = [10, 2, 16, 9, 3, 2, 16, 20]
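+ # ids follow the toy vocab defined in setUpClass: "lo" -> 10, "w" -> 2, "er</w>" -> 16, "n" -> 9, "e" -> 3, and "<unk>" -> 20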
83
+ self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
84
+
85
+ @require_ftfy
86
+ def test_check_encoding_slow_fast(self):
87
+ for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
88
+ with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
89
+ tokenizer_s = self.get_tokenizer(pretrained_name, **kwargs)
90
+ tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
91
+
92
+ text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d."
93
+ text_tokenized_s = tokenizer_s.tokenize(text)
94
+ text_tokenized_r = tokenizer_r.tokenize(text)
95
+
96
+ self.assertListEqual(text_tokenized_s, text_tokenized_r)
97
+
98
+ # Test that the tokenization is identical on an example containing a character (Latin Small Letter A
99
+ # with Tilde) encoded in 2 different ways
100
+ text = "xa\u0303y" + " " + "x\xe3y"
101
+ text_tokenized_s = tokenizer_s.tokenize(text)
102
+ text_tokenized_r = tokenizer_r.tokenize(text)
103
+
104
+ self.assertListEqual(text_tokenized_s, text_tokenized_r)
105
+
106
+ # Test that the tokenization is identical on unicode of space type
107
+ spaces_unicodes = [
108
+ "\u0009", # (horizontal tab, '\t')
109
+ "\u000b", # (vertical tab)
110
+ "\u000c", # (form feed)
111
+ "\u0020", # (space, ' ')
112
+ "\u200e", # (left-to-right mark):w
113
+ "\u200f", # (right-to-left mark)
114
+ ]
115
+ for unicode_seq in spaces_unicodes:
116
+ text_tokenized_s = tokenizer_s.tokenize(unicode_seq)
117
+ text_tokenized_r = tokenizer_r.tokenize(unicode_seq)
118
+
119
+ self.assertListEqual(text_tokenized_s, text_tokenized_r)
120
+
121
+ # Test that the tokenization is identical on unicode of line break type
122
+ line_break_unicodes = [
123
+ "\u000a", # (line feed, '\n')
124
+ "\r\n", # (carriage return and line feed, '\r\n')
125
+ "\u000d", # (carriage return, '\r')
126
+ "\r", # (carriage return, '\r')
127
+ "\u000d", # (carriage return, '\r')
128
+ "\u2028", # (line separator)
129
+ "\u2029", # (paragraph separator)
130
+ # "\u0085", # (next line)
131
+ ]
132
+
133
+ # The tokenization is not identical for the character "\u0085" (next line). The slow version using ftfy transforms
134
+ # it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a
135
+ # space (and thus into an empty list).
136
+
137
+ for unicode_seq in line_break_unicodes:
138
+ text_tokenized_s = tokenizer_s.tokenize(unicode_seq)
139
+ text_tokenized_r = tokenizer_r.tokenize(unicode_seq)
140
+
141
+ self.assertListEqual(text_tokenized_s, text_tokenized_r)
142
+
143
+ def test_offsets_mapping_with_different_add_prefix_space_argument(self):
144
+ # Test which aims to verify that the offsets are well adapted to the argument `add_prefix_space`
145
+ for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
146
+ with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
147
+ text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name`
148
+ text = f"{text_of_1_token} {text_of_1_token}"
149
+
150
+ tokenizer_r = self.get_rust_tokenizer(
151
+ pretrained_name,
152
+ use_fast=True,
153
+ )
154
+ encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
155
+ self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
156
+ self.assertEqual(
157
+ encoding.offset_mapping[1],
158
+ (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
159
+ )
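+ # i.e. for "hello hello": the first token spans offsets (0, 5) and the second spans (6, 11)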
160
+
161
+ text = f" {text}"
162
+
163
+ tokenizer_r = self.get_rust_tokenizer(
164
+ pretrained_name,
165
+ use_fast=True,
166
+ )
167
+ encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
168
+ self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token)))
169
+ self.assertEqual(
170
+ encoding.offset_mapping[1],
171
+ (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
172
+ )
173
+
174
+ def test_log_warning(self):
175
+ # Test related to the breaking change introduced in transformers v4.17.0
176
+ # We need to check that an error is raised when the user tries to load a previous version of the tokenizer.
177
+ with self.assertRaises(ValueError) as context:
178
+ self.get_rust_tokenizer("robot-test/old-clip-tokenizer")
179
+
180
+ self.assertTrue(
181
+ context.exception.args[0].startswith(
182
+ "The `backend_tokenizer` provided does not match the expected format."
183
+ )
184
+ )
185
+
186
+ @require_ftfy
187
+ def test_tokenization_python_rust_equals(self):
188
+ super().test_tokenization_python_rust_equals()
189
+
190
+ @unittest.skip(reason="CLIP always lower cases letters")
191
+ def test_added_tokens_do_lower_case(self):
192
+ pass
docs/transformers/tests/models/clipseg/__init__.py ADDED
File without changes
docs/transformers/tests/models/clipseg/test_modeling_clipseg.py ADDED
@@ -0,0 +1,714 @@
1
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Testing suite for the PyTorch CLIPSeg model."""
15
+
16
+ import inspect
17
+ import os
18
+ import tempfile
19
+ import unittest
20
+
21
+ import numpy as np
22
+ import requests
23
+
24
+ from transformers import CLIPSegConfig, CLIPSegProcessor, CLIPSegTextConfig, CLIPSegVisionConfig
25
+ from transformers.testing_utils import (
26
+ require_torch,
27
+ require_vision,
28
+ slow,
29
+ torch_device,
30
+ )
31
+ from transformers.utils import is_torch_available, is_vision_available
32
+
33
+ from ...test_configuration_common import ConfigTester
34
+ from ...test_modeling_common import (
35
+ ModelTesterMixin,
36
+ _config_zero_init,
37
+ floats_tensor,
38
+ ids_tensor,
39
+ random_attention_mask,
40
+ )
41
+ from ...test_pipeline_mixin import PipelineTesterMixin
42
+
43
+
44
+ if is_torch_available():
45
+ import torch
46
+ from torch import nn
47
+
48
+ from transformers import CLIPSegForImageSegmentation, CLIPSegModel, CLIPSegTextModel, CLIPSegVisionModel
49
+ from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES
50
+
51
+
52
+ if is_vision_available():
53
+ from PIL import Image
54
+
55
+
56
+ class CLIPSegVisionModelTester:
57
+ def __init__(
58
+ self,
59
+ parent,
60
+ batch_size=12,
61
+ image_size=30,
62
+ patch_size=2,
63
+ num_channels=3,
64
+ is_training=True,
65
+ hidden_size=32,
66
+ num_hidden_layers=2,
67
+ num_attention_heads=4,
68
+ intermediate_size=37,
69
+ dropout=0.1,
70
+ attention_dropout=0.1,
71
+ initializer_range=0.02,
72
+ scope=None,
73
+ ):
74
+ self.parent = parent
75
+ self.batch_size = batch_size
76
+ self.image_size = image_size
77
+ self.patch_size = patch_size
78
+ self.num_channels = num_channels
79
+ self.is_training = is_training
80
+ self.hidden_size = hidden_size
81
+ self.num_hidden_layers = num_hidden_layers
82
+ self.num_attention_heads = num_attention_heads
83
+ self.intermediate_size = intermediate_size
84
+ self.dropout = dropout
85
+ self.attention_dropout = attention_dropout
86
+ self.initializer_range = initializer_range
87
+ self.scope = scope
88
+
89
+ # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
90
+ num_patches = (image_size // patch_size) ** 2
91
+ self.seq_length = num_patches + 1
92
+
93
+ def prepare_config_and_inputs(self):
94
+ pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
95
+ config = self.get_config()
96
+
97
+ return config, pixel_values
98
+
99
+ def get_config(self):
100
+ return CLIPSegVisionConfig(
101
+ image_size=self.image_size,
102
+ patch_size=self.patch_size,
103
+ num_channels=self.num_channels,
104
+ hidden_size=self.hidden_size,
105
+ num_hidden_layers=self.num_hidden_layers,
106
+ num_attention_heads=self.num_attention_heads,
107
+ intermediate_size=self.intermediate_size,
108
+ dropout=self.dropout,
109
+ attention_dropout=self.attention_dropout,
110
+ initializer_range=self.initializer_range,
111
+ )
112
+
113
+ def create_and_check_model(self, config, pixel_values):
114
+ model = CLIPSegVisionModel(config=config)
115
+ model.to(torch_device)
116
+ model.eval()
117
+ with torch.no_grad():
118
+ result = model(pixel_values)
119
+ # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
120
+ image_size = (self.image_size, self.image_size)
121
+ patch_size = (self.patch_size, self.patch_size)
122
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
123
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
124
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
125
+
126
+ def prepare_config_and_inputs_for_common(self):
127
+ config_and_inputs = self.prepare_config_and_inputs()
128
+ config, pixel_values = config_and_inputs
129
+ inputs_dict = {"pixel_values": pixel_values}
130
+ return config, inputs_dict
131
+
132
+
133
+ @require_torch
134
+ class CLIPSegVisionModelTest(ModelTesterMixin, unittest.TestCase):
135
+ """
136
+ Here we also overwrite some of the tests of test_modeling_common.py, as CLIPSeg does not use input_ids, inputs_embeds,
137
+ attention_mask and seq_length.
138
+ """
139
+
140
+ all_model_classes = (CLIPSegVisionModel,) if is_torch_available() else ()
141
+ fx_compatible = False
142
+ test_pruning = False
143
+ test_resize_embeddings = False
144
+ test_head_masking = False
145
+
146
+ def setUp(self):
147
+ self.model_tester = CLIPSegVisionModelTester(self)
148
+ self.config_tester = ConfigTester(
149
+ self, config_class=CLIPSegVisionConfig, has_text_modality=False, hidden_size=37
150
+ )
151
+
152
+ def test_config(self):
153
+ self.config_tester.run_common_tests()
154
+
155
+ @unittest.skip(reason="CLIPSeg does not use inputs_embeds")
156
+ def test_inputs_embeds(self):
157
+ pass
158
+
159
+ def test_model_get_set_embeddings(self):
160
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
161
+
162
+ for model_class in self.all_model_classes:
163
+ model = model_class(config)
164
+ self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
165
+ x = model.get_output_embeddings()
166
+ self.assertTrue(x is None or isinstance(x, nn.Linear))
167
+
168
+ def test_forward_signature(self):
169
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
170
+
171
+ for model_class in self.all_model_classes:
172
+ model = model_class(config)
173
+ signature = inspect.signature(model.forward)
174
+ # signature.parameters is an OrderedDict => so arg_names order is deterministic
175
+ arg_names = [*signature.parameters.keys()]
176
+
177
+ expected_arg_names = ["pixel_values"]
178
+ self.assertListEqual(arg_names[:1], expected_arg_names)
179
+
180
+ def test_model(self):
181
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
182
+ self.model_tester.create_and_check_model(*config_and_inputs)
183
+
184
+ @unittest.skip
185
+ def test_training(self):
186
+ pass
187
+
188
+ @unittest.skip
189
+ def test_training_gradient_checkpointing(self):
190
+ pass
191
+
192
+ @unittest.skip(
193
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
194
+ )
195
+ def test_training_gradient_checkpointing_use_reentrant(self):
196
+ pass
197
+
198
+ @unittest.skip(
199
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
200
+ )
201
+ def test_training_gradient_checkpointing_use_reentrant_false(self):
202
+ pass
203
+
204
+ @slow
205
+ def test_model_from_pretrained(self):
206
+ model_name = "CIDAS/clipseg-rd64-refined"
207
+ model = CLIPSegVisionModel.from_pretrained(model_name)
208
+ self.assertIsNotNone(model)
209
+
210
+
211
+ class CLIPSegTextModelTester:
212
+ def __init__(
213
+ self,
214
+ parent,
215
+ batch_size=12,
216
+ seq_length=7,
217
+ is_training=True,
218
+ use_input_mask=True,
219
+ use_labels=True,
220
+ vocab_size=99,
221
+ hidden_size=32,
222
+ num_hidden_layers=2,
223
+ num_attention_heads=4,
224
+ intermediate_size=37,
225
+ dropout=0.1,
226
+ attention_dropout=0.1,
227
+ max_position_embeddings=512,
228
+ initializer_range=0.02,
229
+ scope=None,
230
+ ):
231
+ self.parent = parent
232
+ self.batch_size = batch_size
233
+ self.seq_length = seq_length
234
+ self.is_training = is_training
235
+ self.use_input_mask = use_input_mask
236
+ self.use_labels = use_labels
237
+ self.vocab_size = vocab_size
238
+ self.hidden_size = hidden_size
239
+ self.num_hidden_layers = num_hidden_layers
240
+ self.num_attention_heads = num_attention_heads
241
+ self.intermediate_size = intermediate_size
242
+ self.dropout = dropout
243
+ self.attention_dropout = attention_dropout
244
+ self.max_position_embeddings = max_position_embeddings
245
+ self.initializer_range = initializer_range
246
+ self.scope = scope
247
+
248
+ def prepare_config_and_inputs(self):
249
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
250
+
251
+ input_mask = None
252
+ if self.use_input_mask:
253
+ input_mask = random_attention_mask([self.batch_size, self.seq_length])
254
+
255
+ if input_mask is not None:
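+ # make each attention mask a contiguous run of 1s followed by 0s (attended prefix, padded suffix)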
256
+ batch_size, seq_length = input_mask.shape
257
+ rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
258
+ for batch_idx, start_index in enumerate(rnd_start_indices):
259
+ input_mask[batch_idx, :start_index] = 1
260
+ input_mask[batch_idx, start_index:] = 0
261
+
262
+ config = self.get_config()
263
+
264
+ return config, input_ids, input_mask
265
+
266
+ def get_config(self):
267
+ return CLIPSegTextConfig(
268
+ vocab_size=self.vocab_size,
269
+ hidden_size=self.hidden_size,
270
+ num_hidden_layers=self.num_hidden_layers,
271
+ num_attention_heads=self.num_attention_heads,
272
+ intermediate_size=self.intermediate_size,
273
+ dropout=self.dropout,
274
+ attention_dropout=self.attention_dropout,
275
+ max_position_embeddings=self.max_position_embeddings,
276
+ initializer_range=self.initializer_range,
277
+ )
278
+
279
+ def create_and_check_model(self, config, input_ids, input_mask):
280
+ model = CLIPSegTextModel(config=config)
281
+ model.to(torch_device)
282
+ model.eval()
283
+ with torch.no_grad():
284
+ result = model(input_ids, attention_mask=input_mask)
285
+ result = model(input_ids)
286
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
287
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
288
+
289
+ def prepare_config_and_inputs_for_common(self):
290
+ config_and_inputs = self.prepare_config_and_inputs()
291
+ config, input_ids, input_mask = config_and_inputs
292
+ inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
293
+ return config, inputs_dict
294
+
295
+
296
+ @require_torch
297
+ class CLIPSegTextModelTest(ModelTesterMixin, unittest.TestCase):
298
+ all_model_classes = (CLIPSegTextModel,) if is_torch_available() else ()
299
+ fx_compatible = False
300
+ test_pruning = False
301
+ test_head_masking = False
302
+ model_split_percents = [0.5, 0.8, 0.9]
303
+
304
+ def setUp(self):
305
+ self.model_tester = CLIPSegTextModelTester(self)
306
+ self.config_tester = ConfigTester(self, config_class=CLIPSegTextConfig, hidden_size=37)
307
+
308
+ def test_config(self):
309
+ self.config_tester.run_common_tests()
310
+
311
+ def test_model(self):
312
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
313
+ self.model_tester.create_and_check_model(*config_and_inputs)
314
+
315
+ @unittest.skip
316
+ def test_training(self):
317
+ pass
318
+
319
+ @unittest.skip
320
+ def test_training_gradient_checkpointing(self):
321
+ pass
322
+
323
+ @unittest.skip(
324
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
325
+ )
326
+ def test_training_gradient_checkpointing_use_reentrant(self):
327
+ pass
328
+
329
+ @unittest.skip(
330
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
331
+ )
332
+ def test_training_gradient_checkpointing_use_reentrant_false(self):
333
+ pass
334
+
335
+ @unittest.skip(reason="CLIPSeg does not use inputs_embeds")
336
+ def test_inputs_embeds(self):
337
+ pass
338
+
339
+ @slow
340
+ def test_model_from_pretrained(self):
341
+ model_name = "CIDAS/clipseg-rd64-refined"
342
+ model = CLIPSegTextModel.from_pretrained(model_name)
343
+ self.assertIsNotNone(model)
344
+
345
+
346
+ class CLIPSegModelTester:
347
+ def __init__(
348
+ self,
349
+ parent,
350
+ text_kwargs=None,
351
+ vision_kwargs=None,
352
+ is_training=True,
353
+ # This should respect the `num_hidden_layers` in `CLIPSegVisionModelTester`
354
+ extract_layers=(1,),
355
+ ):
356
+ if text_kwargs is None:
357
+ text_kwargs = {}
358
+ if vision_kwargs is None:
359
+ vision_kwargs = {}
360
+
361
+ self.parent = parent
362
+ self.text_model_tester = CLIPSegTextModelTester(parent, **text_kwargs)
363
+ self.vision_model_tester = CLIPSegVisionModelTester(parent, **vision_kwargs)
364
+ self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test
365
+ self.is_training = is_training
366
+ self.extract_layers = extract_layers
367
+
368
+ def prepare_config_and_inputs(self):
369
+ text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
370
+ vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
371
+
372
+ config = self.get_config()
373
+
374
+ return config, input_ids, attention_mask, pixel_values
375
+
376
+ def get_config(self):
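+ # projection_dim, reduce_dim and extract_layers are kept small so the test model stays lightweight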
377
+ return CLIPSegConfig.from_text_vision_configs(
378
+ self.text_model_tester.get_config(),
379
+ self.vision_model_tester.get_config(),
380
+ projection_dim=64,
381
+ reduce_dim=32,
382
+ extract_layers=self.extract_layers,
383
+ )
384
+
385
+ def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
386
+ model = CLIPSegModel(config).to(torch_device).eval()
387
+ with torch.no_grad():
388
+ result = model(input_ids, pixel_values, attention_mask)
389
+ self.parent.assertEqual(
390
+ result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
391
+ )
392
+ self.parent.assertEqual(
393
+ result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
394
+ )
395
+
396
+ def create_and_check_model_for_image_segmentation(self, config, input_ids, attention_mask, pixel_values):
397
+ model = CLIPSegForImageSegmentation(config).to(torch_device).eval()
398
+ with torch.no_grad():
399
+ result = model(input_ids, pixel_values)
400
+ self.parent.assertEqual(
401
+ result.logits.shape,
402
+ (
403
+ self.vision_model_tester.batch_size,
404
+ self.vision_model_tester.image_size,
405
+ self.vision_model_tester.image_size,
406
+ ),
407
+ )
408
+ self.parent.assertEqual(
409
+ result.conditional_embeddings.shape, (self.text_model_tester.batch_size, config.projection_dim)
410
+ )
411
+
412
+ def prepare_config_and_inputs_for_common(self):
413
+ config_and_inputs = self.prepare_config_and_inputs()
414
+ config, input_ids, attention_mask, pixel_values = config_and_inputs
415
+ inputs_dict = {
416
+ "input_ids": input_ids,
417
+ "attention_mask": attention_mask,
418
+ "pixel_values": pixel_values,
419
+ }
420
+ return config, inputs_dict
421
+
422
+
423
+ @require_torch
424
+ class CLIPSegModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
425
+ all_model_classes = (CLIPSegModel, CLIPSegForImageSegmentation) if is_torch_available() else ()
426
+ pipeline_model_mapping = {"feature-extraction": CLIPSegModel} if is_torch_available() else {}
427
+ fx_compatible = False
428
+ test_head_masking = False
429
+ test_pruning = False
430
+ test_resize_embeddings = False
431
+ test_attention_outputs = False
432
+
433
+ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
434
+ # CLIPSegForImageSegmentation requires special treatment
435
+ if return_labels:
436
+ if model_class.__name__ == "CLIPSegForImageSegmentation":
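+ # image segmentation needs dense per-pixel labels of shape (batch_size, height, width)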
437
+ batch_size, _, height, width = inputs_dict["pixel_values"].shape
438
+ inputs_dict["labels"] = torch.zeros(
439
+ [batch_size, height, width], device=torch_device, dtype=torch.float
440
+ )
441
+
442
+ return inputs_dict
443
+
444
+ def setUp(self):
445
+ self.model_tester = CLIPSegModelTester(self)
446
+ common_properties = ["projection_dim", "logit_scale_init_value"]
447
+ self.config_tester = ConfigTester(
448
+ self, config_class=CLIPSegConfig, has_text_modality=False, common_properties=common_properties
449
+ )
450
+
451
+ def test_model(self):
452
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
453
+ self.model_tester.create_and_check_model(*config_and_inputs)
454
+
455
+ def test_config(self):
456
+ self.config_tester.run_common_tests()
457
+
458
+ def test_model_for_image_segmentation(self):
459
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
460
+ self.model_tester.create_and_check_model_for_image_segmentation(*config_and_inputs)
461
+
462
+ @unittest.skip(reason="Hidden_states is tested in individual model tests")
463
+ def test_hidden_states_output(self):
464
+ pass
465
+
466
+ @unittest.skip(reason="Inputs_embeds is tested in individual model tests")
467
+ def test_inputs_embeds(self):
468
+ pass
469
+
470
+ @unittest.skip(reason="Retain_grad is tested in individual model tests")
471
+ def test_retain_grad_hidden_states_attentions(self):
472
+ pass
473
+
474
+ @unittest.skip(reason="CLIPSegModel does not have input/output embeddings")
475
+ def test_model_get_set_embeddings(self):
476
+ pass
477
+
478
+ @unittest.skip(
479
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
480
+ )
481
+ def test_training_gradient_checkpointing(self):
482
+ pass
483
+
484
+ @unittest.skip(
485
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
486
+ )
487
+ def test_training_gradient_checkpointing_use_reentrant(self):
488
+ pass
489
+
490
+ @unittest.skip(
491
+ reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
492
+ )
493
+ def test_training_gradient_checkpointing_use_reentrant_false(self):
494
+ pass
495
+
496
+ # overridden because some parameters require custom initialization
497
+ def test_initialization(self):
498
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
499
+
500
+ configs_no_init = _config_zero_init(config)
501
+ for model_class in self.all_model_classes:
502
+ model = model_class(config=configs_no_init)
503
+ for name, param in model.named_parameters():
504
+ if param.requires_grad:
505
+ # check if `logit_scale` is initialized as per the original implementation
506
+ if "logit_scale" in name:
507
+ self.assertAlmostEqual(
508
+ param.data.item(),
509
+ np.log(1 / 0.07),
510
+ delta=1e-3,
511
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
512
+ )
513
+ elif "film" in name or "transposed_conv" in name or "reduce" in name:
514
+ # those parameters use PyTorch's default nn.Linear initialization scheme
515
+ pass
516
+ else:
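+ # with a zero-init config, the mean of every remaining parameter should round to 0.0 (weights/biases) or 1.0 (norm weights)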
517
+ self.assertIn(
518
+ ((param.data.mean() * 1e9).round() / 1e9).item(),
519
+ [0.0, 1.0],
520
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
521
+ )
522
+
523
+ def _create_and_check_torchscript(self, config, inputs_dict):
524
+ if not self.test_torchscript:
525
+ self.skipTest(reason="test_torchscript is set to False")
526
+
527
+ configs_no_init = _config_zero_init(config) # To be sure we have no Nan
528
+ configs_no_init.torchscript = True
529
+ configs_no_init.return_dict = False
530
+ for model_class in self.all_model_classes:
531
+ model = model_class(config=configs_no_init)
532
+ model.to(torch_device)
533
+ model.eval()
534
+
535
+ try:
536
+ input_ids = inputs_dict["input_ids"]
537
+ pixel_values = inputs_dict["pixel_values"] # CLIPSeg needs pixel_values
538
+ traced_model = torch.jit.trace(model, (input_ids, pixel_values))
539
+ except RuntimeError:
540
+ self.fail("Couldn't trace module.")
541
+
542
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
543
+ pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
544
+
545
+ try:
546
+ torch.jit.save(traced_model, pt_file_name)
547
+ except Exception:
548
+ self.fail("Couldn't save module.")
549
+
550
+ try:
551
+ loaded_model = torch.jit.load(pt_file_name)
552
+ except Exception:
553
+ self.fail("Couldn't load module.")
554
+
555
+ model.to(torch_device)
556
+ model.eval()
557
+
558
+ loaded_model.to(torch_device)
559
+ loaded_model.eval()
560
+
561
+ model_state_dict = model.state_dict()
562
+ loaded_model_state_dict = loaded_model.state_dict()
563
+
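+ # the traced model's state_dict may contain extra non-persistent buffers; collect them so the remaining keys can be compared one-to-one below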
564
+ non_persistent_buffers = {}
565
+ for key in loaded_model_state_dict.keys():
566
+ if key not in model_state_dict.keys():
567
+ non_persistent_buffers[key] = loaded_model_state_dict[key]
568
+
569
+ loaded_model_state_dict = {
570
+ key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
571
+ }
572
+
573
+ self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
574
+
575
+ model_buffers = list(model.buffers())
576
+ for non_persistent_buffer in non_persistent_buffers.values():
577
+ found_buffer = False
578
+ for i, model_buffer in enumerate(model_buffers):
579
+ if torch.equal(non_persistent_buffer, model_buffer):
580
+ found_buffer = True
581
+ break
582
+
583
+ self.assertTrue(found_buffer)
584
+ model_buffers.pop(i)
585
+
586
+ models_equal = True
587
+ for layer_name, p1 in model_state_dict.items():
588
+ p2 = loaded_model_state_dict[layer_name]
589
+ if p1.data.ne(p2.data).sum() > 0:
590
+ models_equal = False
591
+
592
+ self.assertTrue(models_equal)
593
+
594
+ def test_load_vision_text_config(self):
595
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
596
+
597
+ # Save CLIPSegConfig and check if we can load CLIPSegVisionConfig from it
598
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
599
+ config.save_pretrained(tmp_dir_name)
600
+ vision_config = CLIPSegVisionConfig.from_pretrained(tmp_dir_name)
601
+ self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict())
602
+
603
+ # Save CLIPSegConfig and check if we can load CLIPSegTextConfig from it
604
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
605
+ config.save_pretrained(tmp_dir_name)
606
+ text_config = CLIPSegTextConfig.from_pretrained(tmp_dir_name)
607
+ self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict())
608
+
609
+ def test_training(self):
610
+ if not self.model_tester.is_training:
611
+ self.skipTest(reason="Training test is skipped as the model was not trained")
612
+
613
+ for model_class in self.all_model_classes:
614
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
615
+ config.return_dict = True
616
+
617
+ if model_class.__name__ in MODEL_MAPPING_NAMES.values():
618
+ continue
619
+
620
+ print("Model class:", model_class)
621
+
622
+ model = model_class(config)
623
+ model.to(torch_device)
624
+ model.train()
625
+ inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
626
+ for k, v in inputs.items():
627
+ print(k, v.shape)
628
+ loss = model(**inputs).loss
629
+ loss.backward()
630
+
631
+ @slow
632
+ def test_model_from_pretrained(self):
633
+ model_name = "CIDAS/clipseg-rd64-refined"
634
+ model = CLIPSegModel.from_pretrained(model_name)
635
+ self.assertIsNotNone(model)
636
+
637
+
638
+ # We will verify our results on an image of cute cats
639
+ def prepare_img():
640
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
641
+ image = Image.open(requests.get(url, stream=True).raw)
642
+ return image
643
+
644
+
645
+ @require_vision
646
+ @require_torch
647
+ class CLIPSegModelIntegrationTest(unittest.TestCase):
648
+ @slow
649
+ def test_inference_image_segmentation(self):
650
+ model_name = "CIDAS/clipseg-rd64-refined"
651
+ processor = CLIPSegProcessor.from_pretrained(model_name)
652
+ model = CLIPSegForImageSegmentation.from_pretrained(model_name).to(torch_device)
653
+
654
+ image = prepare_img()
655
+ texts = ["a cat", "a remote", "a blanket"]
656
+ inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt").to(torch_device)
657
+
658
+ # forward pass
659
+ with torch.no_grad():
660
+ outputs = model(**inputs)
661
+
662
+ # verify the predicted masks
663
+ self.assertEqual(
664
+ outputs.logits.shape,
665
+ torch.Size((3, 352, 352)),
666
+ )
667
+ expected_masks_slice = torch.tensor(
668
+ [[-7.4613, -7.4785, -7.3628], [-7.3268, -7.0899, -7.1333], [-6.9838, -6.7900, -6.8913]]
669
+ ).to(torch_device)
670
+
671
+ torch.testing.assert_close(outputs.logits[0, :3, :3], expected_masks_slice, rtol=1e-3, atol=1e-3)
672
+
673
+ # verify conditional and pooled output
674
+ expected_conditional = torch.tensor([0.5601, -0.0314, 0.1980]).to(torch_device)
675
+ expected_pooled_output = torch.tensor([0.5036, -0.2681, -0.2644]).to(torch_device)
676
+ torch.testing.assert_close(outputs.conditional_embeddings[0, :3], expected_conditional, rtol=1e-3, atol=1e-3)
677
+ torch.testing.assert_close(outputs.pooled_output[0, :3], expected_pooled_output, rtol=1e-3, atol=1e-3)
678
+
679
+ @slow
680
+ def test_inference_interpolate_pos_encoding(self):
681
+ # ViT models have an `interpolate_pos_encoding` argument in their forward method,
682
+ # allowing interpolation of the pre-trained position embeddings in order to use
683
+ # the model on higher resolutions. The DINO model by Facebook AI leverages this
684
+ # to visualize self-attention on higher resolution images.
685
+ model = CLIPSegModel.from_pretrained("openai/clip-vit-base-patch32").to(torch_device)
686
+
687
+ processor = CLIPSegProcessor.from_pretrained(
688
+ "openai/clip-vit-base-patch32", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180}
689
+ )
690
+
691
+ image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
692
+ inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device)
693
+
694
+ # interpolate_pos_encoding=False should raise a ValueError
695
+ with self.assertRaises(ValueError, msg="doesn't match model"):
696
+ with torch.no_grad():
697
+ model(**inputs, interpolate_pos_encoding=False)
698
+
699
+ # forward pass
700
+ with torch.no_grad():
701
+ outputs = model(**inputs, interpolate_pos_encoding=True)
702
+
703
+ # verify the logits
704
+ expected_shape = torch.Size((1, 26, 768))
705
+
706
+ self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
707
+
708
+ expected_slice = torch.tensor(
709
+ [[-0.1538, 0.0322, -0.3235], [0.2893, 0.1135, -0.5708], [0.0461, 0.1540, -0.6018]]
710
+ ).to(torch_device)
711
+
712
+ torch.testing.assert_close(
713
+ outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4
714
+ )
docs/transformers/tests/models/clipseg/test_processor_clipseg.py ADDED
@@ -0,0 +1,194 @@
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ import os
17
+ import shutil
18
+ import tempfile
19
+ import unittest
20
+
21
+ import pytest
22
+
23
+ from transformers import CLIPTokenizer, CLIPTokenizerFast
24
+ from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
25
+ from transformers.testing_utils import require_vision
26
+ from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
27
+
28
+ from ...test_processing_common import ProcessorTesterMixin
29
+
30
+
31
+ if is_vision_available():
32
+ from transformers import CLIPSegProcessor, ViTImageProcessor
33
+
34
+
35
+ @require_vision
36
+ class CLIPSegProcessorTest(ProcessorTesterMixin, unittest.TestCase):
37
+ processor_class = CLIPSegProcessor
38
+
39
+ def setUp(self):
40
+ self.tmpdirname = tempfile.mkdtemp()
41
+
42
+ vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"] # fmt: skip
43
+ vocab_tokens = dict(zip(vocab, range(len(vocab))))
44
+ merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>", ""]
45
+ self.special_tokens_map = {"unk_token": "<unk>"}
46
+
47
+ self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
48
+ self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
49
+ with open(self.vocab_file, "w", encoding="utf-8") as fp:
50
+ fp.write(json.dumps(vocab_tokens) + "\n")
51
+ with open(self.merges_file, "w", encoding="utf-8") as fp:
52
+ fp.write("\n".join(merges))
53
+
54
+ image_processor_map = {
55
+ "do_resize": True,
56
+ "size": 20,
57
+ "do_center_crop": True,
58
+ "crop_size": 18,
59
+ "do_normalize": True,
60
+ "image_mean": [0.48145466, 0.4578275, 0.40821073],
61
+ "image_std": [0.26862954, 0.26130258, 0.27577711],
62
+ }
63
+ self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
64
+ with open(self.image_processor_file, "w", encoding="utf-8") as fp:
65
+ json.dump(image_processor_map, fp)
66
+
67
+ def get_tokenizer(self, **kwargs):
68
+ return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
69
+
70
+ def get_rust_tokenizer(self, **kwargs):
71
+ return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
72
+
73
+ def get_image_processor(self, **kwargs):
74
+ return ViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
75
+
76
+ def tearDown(self):
77
+ shutil.rmtree(self.tmpdirname)
78
+
79
+ def test_save_load_pretrained_default(self):
80
+ tokenizer_slow = self.get_tokenizer()
81
+ tokenizer_fast = self.get_rust_tokenizer()
82
+ image_processor = self.get_image_processor()
83
+
84
+ processor_slow = CLIPSegProcessor(tokenizer=tokenizer_slow, image_processor=image_processor)
85
+ processor_slow.save_pretrained(self.tmpdirname)
86
+ processor_slow = CLIPSegProcessor.from_pretrained(self.tmpdirname, use_fast=False)
87
+
88
+ processor_fast = CLIPSegProcessor(tokenizer=tokenizer_fast, image_processor=image_processor)
89
+ processor_fast.save_pretrained(self.tmpdirname)
90
+ processor_fast = CLIPSegProcessor.from_pretrained(self.tmpdirname)
91
+
92
+ self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab())
93
+ self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab())
94
+ self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab())
95
+ self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer)
96
+ self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast)
97
+
98
+ self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string())
99
+ self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string())
100
+ self.assertIsInstance(processor_slow.image_processor, ViTImageProcessor)
101
+ self.assertIsInstance(processor_fast.image_processor, ViTImageProcessor)
102
+
103
+ def test_save_load_pretrained_additional_features(self):
104
+ processor = CLIPSegProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
105
+ processor.save_pretrained(self.tmpdirname)
106
+
107
+ tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
108
+ image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
109
+
110
+ processor = CLIPSegProcessor.from_pretrained(
111
+ self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
112
+ )
113
+
114
+ self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
115
+ self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast)
116
+
117
+ self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
118
+ self.assertIsInstance(processor.image_processor, ViTImageProcessor)
119
+
120
+ def test_image_processor(self):
121
+ image_processor = self.get_image_processor()
122
+ tokenizer = self.get_tokenizer()
123
+
124
+ processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
125
+
126
+ image_input = self.prepare_image_inputs()
127
+
128
+ input_feat_extract = image_processor(image_input, return_tensors="np")
129
+ input_processor = processor(images=image_input, return_tensors="np")
130
+
131
+ for key in input_feat_extract.keys():
132
+ self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
133
+
134
+ def test_tokenizer(self):
135
+ image_processor = self.get_image_processor()
136
+ tokenizer = self.get_tokenizer()
137
+
138
+ processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
139
+
140
+ input_str = "lower newer"
141
+
142
+ encoded_processor = processor(text=input_str)
143
+
144
+ encoded_tok = tokenizer(input_str)
145
+
146
+ for key in encoded_tok.keys():
147
+ self.assertListEqual(encoded_tok[key], encoded_processor[key])
148
+
149
+ def test_processor_text(self):
150
+ image_processor = self.get_image_processor()
151
+ tokenizer = self.get_tokenizer()
152
+
153
+ processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
154
+
155
+ input_str = "lower newer"
156
+ image_input = self.prepare_image_inputs()
157
+
158
+ inputs = processor(text=input_str, images=image_input)
159
+
160
+ self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"])
161
+
162
+ # test if it raises when no input is passed
163
+ with pytest.raises(ValueError):
164
+ processor()
165
+
166
+ def test_processor_visual_prompt(self):
167
+ image_processor = self.get_image_processor()
168
+ tokenizer = self.get_tokenizer()
169
+
170
+ processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
171
+
172
+ image_input = self.prepare_image_inputs()
173
+ visual_prompt_input = self.prepare_image_inputs()
174
+
175
+ inputs = processor(images=image_input, visual_prompt=visual_prompt_input)
176
+
177
+ self.assertListEqual(list(inputs.keys()), ["pixel_values", "conditional_pixel_values"])
178
+
179
+ # test if it raises when no input is passed
180
+ with pytest.raises(ValueError):
181
+ processor()
182
+
183
+ def test_tokenizer_decode(self):
184
+ image_processor = self.get_image_processor()
185
+ tokenizer = self.get_tokenizer()
186
+
187
+ processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor)
188
+
189
+ predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
190
+
191
+ decoded_processor = processor.batch_decode(predicted_ids)
192
+ decoded_tok = tokenizer.batch_decode(predicted_ids)
193
+
194
+ self.assertListEqual(decoded_tok, decoded_processor)
docs/transformers/tests/models/clvp/__init__.py ADDED
File without changes
docs/transformers/tests/models/clvp/test_feature_extraction_clvp.py ADDED
@@ -0,0 +1,240 @@
1
+ # Copyright 2023 HuggingFace Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import itertools
16
+ import os
17
+ import random
18
+ import tempfile
19
+ import unittest
20
+
21
+ import numpy as np
22
+ from datasets import Audio, load_dataset
23
+
24
+ from transformers import ClvpFeatureExtractor
25
+ from transformers.testing_utils import (
26
+ check_json_file_has_correct_format,
27
+ cleanup,
28
+ require_torch,
29
+ slow,
30
+ torch_device,
31
+ )
32
+ from transformers.utils.import_utils import is_torch_available
33
+
34
+ from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
35
+
36
+
37
+ if is_torch_available():
38
+ import torch
39
+
40
+ global_rng = random.Random()
41
+
42
+
43
+ # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.floats_list
44
+ def floats_list(shape, scale=1.0, rng=None, name=None):
45
+ """Creates a random float32 tensor"""
46
+ if rng is None:
47
+ rng = global_rng
48
+
49
+ values = []
50
+ for batch_idx in range(shape[0]):
51
+ values.append([])
52
+ for _ in range(shape[1]):
53
+ values[-1].append(rng.random() * scale)
54
+
55
+ return values
56
+
57
+
58
+ @require_torch
59
+ class ClvpFeatureExtractionTester:
60
+ def __init__(
61
+ self,
62
+ parent,
63
+ batch_size=7,
64
+ min_seq_length=400,
65
+ max_seq_length=2000,
66
+ feature_size=10,
67
+ hop_length=160,
68
+ chunk_length=8,
69
+ padding_value=0.0,
70
+ sampling_rate=4_000,
71
+ return_attention_mask=False,
72
+ ):
73
+ self.parent = parent
74
+ self.batch_size = batch_size
75
+ self.min_seq_length = min_seq_length
76
+ self.max_seq_length = max_seq_length
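+ # step between consecutive sequence lengths so that batched inputs strictly increase in length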
77
+ self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
78
+ self.padding_value = padding_value
79
+ self.sampling_rate = sampling_rate
80
+ self.return_attention_mask = return_attention_mask
81
+ self.feature_size = feature_size
82
+ self.chunk_length = chunk_length
83
+ self.hop_length = hop_length
84
+
85
+ def prepare_feat_extract_dict(self):
86
+ return {
87
+ "feature_size": self.feature_size,
88
+ "hop_length": self.hop_length,
89
+ "chunk_length": self.chunk_length,
90
+ "padding_value": self.padding_value,
91
+ "sampling_rate": self.sampling_rate,
92
+ "return_attention_mask": self.return_attention_mask,
93
+ }
94
+
95
+ # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester.prepare_inputs_for_common
96
+ def prepare_inputs_for_common(self, equal_length=False, numpify=False):
97
+ def _flatten(list_of_lists):
98
+ return list(itertools.chain(*list_of_lists))
99
+
100
+ if equal_length:
101
+ speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
102
+ else:
103
+ # make sure that inputs increase in size
104
+ speech_inputs = [
105
+ floats_list((x, self.feature_size))
106
+ for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
107
+ ]
108
+ if numpify:
109
+ speech_inputs = [np.asarray(x) for x in speech_inputs]
110
+ return speech_inputs
111
+
112
+
113
+ @require_torch
114
+ class ClvpFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
115
+ feature_extraction_class = ClvpFeatureExtractor
116
+
117
+ def setUp(self):
118
+ self.feat_extract_tester = ClvpFeatureExtractionTester(self)
119
+
120
+ def tearDown(self):
121
+ super().tearDown()
122
+ # clean up as much of the GPU memory occupied by PyTorch as possible
123
+ cleanup(torch_device)
124
+
125
+ # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_from_and_save_pretrained
126
+ def test_feat_extract_from_and_save_pretrained(self):
127
+ feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
128
+
129
+ with tempfile.TemporaryDirectory() as tmpdirname:
130
+ saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
131
+ check_json_file_has_correct_format(saved_file)
132
+ feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
133
+
134
+ dict_first = feat_extract_first.to_dict()
135
+ dict_second = feat_extract_second.to_dict()
136
+ mel_1 = feat_extract_first.mel_filters
137
+ mel_2 = feat_extract_second.mel_filters
138
+ self.assertTrue(np.allclose(mel_1, mel_2))
139
+ self.assertEqual(dict_first, dict_second)
140
+
141
+ # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_to_json_file
142
+ def test_feat_extract_to_json_file(self):
143
+ feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
144
+
145
+ with tempfile.TemporaryDirectory() as tmpdirname:
146
+ json_file_path = os.path.join(tmpdirname, "feat_extract.json")
147
+ feat_extract_first.to_json_file(json_file_path)
148
+ feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
149
+
150
+ dict_first = feat_extract_first.to_dict()
151
+ dict_second = feat_extract_second.to_dict()
152
+ mel_1 = feat_extract_first.mel_filters
153
+ mel_2 = feat_extract_second.mel_filters
154
+ self.assertTrue(np.allclose(mel_1, mel_2))
155
+ self.assertEqual(dict_first, dict_second)
156
+
157
+ def test_call(self):
158
+ # Test that all calls wrap encode_plus and batch_encode_plus
159
+ feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
160
+ # create three inputs of length 800, 1000, and 1200
161
+ speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
162
+ np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
163
+
164
+ # Test feature size
165
+ input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features
166
+ self.assertTrue(input_features.ndim == 3)
167
+ self.assertTrue(input_features.shape[-2] == feature_extractor.feature_size)
168
+
169
+ # Test not batched input
170
+ encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
171
+ encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
172
+ self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
173
+
174
+ # Test batched
175
+ encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
176
+ encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
177
+ for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
178
+ self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
179
+
180
+ # Test 2-D numpy arrays are batched.
181
+ speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
182
+ np_speech_inputs = np.asarray(speech_inputs)
183
+ encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
184
+ encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
185
+ for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
186
+ self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
187
+
188
+ # Test truncation required
189
+ speech_inputs = [floats_list((1, x))[0] for x in range(200, (feature_extractor.n_samples + 500), 200)]
190
+ np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
191
+
192
+ speech_inputs_truncated = [x[: feature_extractor.n_samples] for x in speech_inputs]
193
+ np_speech_inputs_truncated = [np.asarray(speech_input) for speech_input in speech_inputs_truncated]
194
+
195
+ encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
196
+ encoded_sequences_2 = feature_extractor(np_speech_inputs_truncated, return_tensors="np").input_features
197
+ for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
198
+ self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
199
+
200
+ # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_double_precision_pad
201
+ def test_double_precision_pad(self):
202
+ import torch
203
+
204
+ feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
205
+ np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
206
+ py_speech_inputs = np_speech_inputs.tolist()
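+ # float64 inputs should be downcast to float32 by pad(), whether returned as numpy arrays or torch tensors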
207
+
208
+ for inputs in [py_speech_inputs, np_speech_inputs]:
209
+ np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
210
+ self.assertTrue(np_processed.input_features.dtype == np.float32)
211
+ pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
212
+ self.assertTrue(pt_processed.input_features.dtype == torch.float32)
213
+
214
+ def _load_datasamples(self, num_samples):
215
+ ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
216
+ ds = ds.cast_column("audio", Audio(sampling_rate=22050))
217
+ # automatic decoding with librispeech
218
+ speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
219
+
220
+ return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
221
+
222
+ @slow
223
+ def test_integration(self):
224
+ # fmt: off
225
+ EXPECTED_INPUT_FEATURES = torch.tensor(
226
+ [
227
+ 0.9271, 1.1405, 1.4419, 1.2470, 1.2438, 1.1787, 1.0595, 1.0570, 1.1070,
228
+ 1.2205, 1.2376, 1.2997, 1.1131, 1.0843, 1.0459, 1.1858, 1.2323, 1.3582,
229
+ 1.3401, 1.3770, 1.4173, 1.3381, 1.2291, 1.0854, 1.2116, 1.1873, 1.2178,
230
+ 1.2137, 1.3001, 1.4274
231
+ ]
232
+ )
233
+ # fmt: on
234
+
235
+ input_speech, sr = self._load_datasamples(1)
236
+
237
+ feature_extractor = ClvpFeatureExtractor.from_pretrained("susnato/clvp_dev")
238
+ input_features = feature_extractor(input_speech, sampling_rate=sr[0], return_tensors="pt").input_features
239
+ self.assertEqual(input_features.shape, (1, 80, 517))
240
+ torch.testing.assert_close(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, rtol=1e-4, atol=1e-4)
docs/transformers/tests/models/clvp/test_modeling_clvp.py ADDED
@@ -0,0 +1,640 @@
1
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Testing suite for the PyTorch Clvp model."""
15
+
16
+ import tempfile
17
+ import unittest
18
+
19
+ import datasets
20
+ import numpy as np
21
+
22
+ from transformers import ClvpConfig, ClvpDecoderConfig, ClvpEncoderConfig
23
+ from transformers.testing_utils import (
24
+ cleanup,
25
+ require_torch,
26
+ slow,
27
+ torch_device,
28
+ )
29
+ from transformers.utils import is_torch_available
30
+
31
+ from ...generation.test_utils import GenerationTesterMixin
32
+ from ...test_configuration_common import ConfigTester
33
+ from ...test_modeling_common import (
34
+ ModelTesterMixin,
35
+ _config_zero_init,
36
+ ids_tensor,
37
+ random_attention_mask,
38
+ )
39
+ from ...test_pipeline_mixin import PipelineTesterMixin
40
+
41
+
42
+ if is_torch_available():
43
+ import torch
44
+
45
+ from transformers import ClvpEncoder, ClvpForCausalLM, ClvpModel, ClvpModelForConditionalGeneration
46
+
47
+ from transformers import ClvpFeatureExtractor, ClvpTokenizer
48
+
49
+
50
+ class ClvpEncoderTester:
51
+ def __init__(
52
+ self,
53
+ parent,
54
+ batch_size=2,
55
+ seq_length=7,
56
+ is_training=False,
57
+ use_input_mask=True,
58
+ use_labels=True,
59
+ vocab_size=50,
60
+ hidden_size=128,
61
+ projection_dim=16,
62
+ num_hidden_layers=2,
63
+ num_attention_heads=4,
64
+ intermediate_size=32,
65
+ dropout=0.1,
66
+ attention_dropout=0.1,
67
+ initializer_range=0.02,
68
+ scope=None,
69
+ ):
70
+ self.parent = parent
71
+ self.batch_size = batch_size
72
+ self.seq_length = seq_length
73
+ self.is_training = is_training
74
+ self.use_input_mask = use_input_mask
75
+ self.use_labels = use_labels
76
+ self.vocab_size = vocab_size
77
+ self.hidden_size = hidden_size
78
+ self.projection_dim = projection_dim
79
+ self.num_hidden_layers = num_hidden_layers
80
+ self.num_attention_heads = num_attention_heads
81
+ self.intermediate_size = intermediate_size
82
+ self.dropout = dropout
83
+ self.attention_dropout = attention_dropout
84
+ self.initializer_range = initializer_range
85
+ self.scope = scope
86
+ self.bos_token_id = vocab_size - 1
87
+ self.eos_token_id = vocab_size - 1
88
+
89
+ def get_config(self):
90
+ encoder_config = ClvpEncoderConfig(
91
+ vocab_size=self.vocab_size,
92
+ hidden_size=self.hidden_size,
93
+ projection_dim=self.projection_dim,
94
+ num_hidden_layers=self.num_hidden_layers,
95
+ num_attention_heads=self.num_attention_heads,
96
+ intermediate_size=self.intermediate_size,
97
+ dropout=self.dropout,
98
+ attention_dropout=self.attention_dropout,
99
+ initializer_range=self.initializer_range,
100
+ bos_token_id=self.bos_token_id,
101
+ eos_token_id=self.eos_token_id,
102
+ )
103
+
104
+ return encoder_config
105
+
106
+ def prepare_config_and_inputs(self):
107
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
108
+
109
+ input_mask = None
110
+ if self.use_input_mask:
111
+ input_mask = random_attention_mask([self.batch_size, self.seq_length])
112
+
113
+ if input_mask is not None:
114
+ batch_size, seq_length = input_mask.shape
115
+ rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
116
+ for batch_idx, start_index in enumerate(rnd_start_indices):
117
+ input_mask[batch_idx, :start_index] = 1
118
+ input_mask[batch_idx, start_index:] = 0
119
+
120
+ encoder_config = self.get_config()
121
+
122
+ return encoder_config, input_ids, input_mask
123
+
124
+ def prepare_config_and_inputs_for_common(self):
125
+ config_and_inputs = self.prepare_config_and_inputs()
126
+ speech_config, input_ids, input_mask = config_and_inputs
127
+ inputs_dict = {"input_ids": input_ids.to(torch_device), "attention_mask": input_mask.to(torch_device)}
128
+ return speech_config, inputs_dict
129
+
130
+ def create_and_check_model(self, speech_config, input_ids, input_mask):
131
+ text_config = ClvpEncoderConfig(
132
+ vocab_size=self.vocab_size,
133
+ hidden_size=self.hidden_size,
134
+ projection_dim=self.projection_dim,
135
+ num_hidden_layers=self.num_hidden_layers,
136
+ num_attention_heads=self.num_attention_heads,
137
+ intermediate_size=self.intermediate_size,
138
+ dropout=self.dropout,
139
+ attention_dropout=self.attention_dropout,
140
+ initializer_range=self.initializer_range,
141
+ )
142
+ text_encoder_model = ClvpEncoder(config=text_config)
143
+ text_encoder_model.to(torch_device)
144
+ text_encoder_model.eval()
145
+ with torch.no_grad():
146
+ result = text_encoder_model(input_ids, attention_mask=input_mask)
147
+ result = text_encoder_model(input_ids)
148
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
149
+ self.parent.assertEqual(result[0].shape, (self.batch_size, self.projection_dim))
150
+
151
+ # now check with speech config
152
+ speech_encoder_model = ClvpEncoder(config=speech_config)
153
+ speech_encoder_model.to(torch_device)
154
+ speech_encoder_model.eval()
155
+ with torch.no_grad():
156
+ result = speech_encoder_model(input_ids, attention_mask=input_mask)
157
+ result = speech_encoder_model(input_ids)
158
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
159
+ self.parent.assertEqual(result[0].shape, (self.batch_size, self.projection_dim))
160
+
161
+
162
+ @require_torch
163
+ class ClvpEncoderTest(ModelTesterMixin, unittest.TestCase):
164
+ all_model_classes = (ClvpEncoder,) if is_torch_available() else ()
165
+ test_pruning = False
166
+ test_head_masking = False
167
+ test_torchscript = False
168
+
169
+ def setUp(self):
170
+ self.model_tester = ClvpEncoderTester(self)
171
+ self.encoder_config_tester = ConfigTester(self, config_class=ClvpEncoderConfig, hidden_size=32)
172
+
173
+ def tearDown(self):
174
+ super().tearDown()
175
+ # clean up as much of the GPU memory occupied by PyTorch as possible
176
+ cleanup(torch_device)
177
+
178
+ def test_config(self):
179
+ self.encoder_config_tester.run_common_tests()
180
+
181
+ def test_model(self):
182
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
183
+ self.model_tester.create_and_check_model(*config_and_inputs)
184
+
185
+ @unittest.skip(reason="ClvpEncoder does not output loss")
186
+ def test_training(self):
187
+ pass
188
+
189
+ @unittest.skip(reason="ClvpEncoder does not output loss")
190
+ def test_training_gradient_checkpointing(self):
191
+ pass
192
+
193
+
194
+ class ClvpDecoderTester:
195
+ def __init__(
196
+ self,
197
+ parent,
198
+ batch_size=2,
199
+ seq_length=3,
200
+ is_training=False,
201
+ vocab_size=300,
202
+ max_position_embeddings=256,
203
+ max_text_tokens=256,
204
+ use_input_mask=True,
205
+ hidden_size=128,
206
+ num_hidden_layers=2,
207
+ num_attention_heads=2,
208
+ bos_token_id=97,
209
+ eos_token_id=98,
210
+ relative_attention_num_buckets=4,
211
+ relative_attention_max_distance=16,
212
+ ):
213
+ self.parent = parent
214
+ self.batch_size = batch_size
215
+ self.seq_length = seq_length
216
+ self.is_training = is_training
217
+ self.vocab_size = vocab_size
218
+ self.max_position_embeddings = max_position_embeddings
219
+ self.max_text_tokens = max_text_tokens
220
+ self.use_input_mask = use_input_mask
221
+ self.hidden_size = hidden_size
222
+ self.num_attention_heads = num_attention_heads
223
+ self.num_hidden_layers = num_hidden_layers
224
+ self.bos_token_id = bos_token_id
225
+ self.eos_token_id = eos_token_id
226
+ self.relative_attention_num_buckets = relative_attention_num_buckets
227
+ self.relative_attention_max_distance = relative_attention_max_distance
228
+
229
+ def get_config(self):
230
+ decoder_config = ClvpDecoderConfig(
231
+ vocab_size=self.vocab_size,
232
+ max_position_embeddings=self.max_position_embeddings,
233
+ max_text_tokens=self.max_text_tokens,
234
+ hidden_size=self.hidden_size,
235
+ num_hidden_layers=self.num_hidden_layers,
236
+ num_attention_heads=self.num_attention_heads,
237
+ bos_token_id=self.bos_token_id,
238
+ eos_token_id=self.eos_token_id,
239
+ relative_attention_num_buckets=self.relative_attention_num_buckets,
240
+ relative_attention_max_distance=self.relative_attention_max_distance,
241
+ )
242
+
243
+ return decoder_config
244
+
245
+ def prepare_config_and_inputs(self):
246
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
247
+
248
+ input_mask = None
249
+ if self.use_input_mask:
250
+ input_mask = random_attention_mask([self.batch_size, self.seq_length])
251
+
252
+ if input_mask is not None:
253
+ batch_size, seq_length = input_mask.shape
254
+ rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
255
+ for batch_idx, start_index in enumerate(rnd_start_indices):
256
+ input_mask[batch_idx, :start_index] = 1
257
+ input_mask[batch_idx, start_index:] = 0
258
+
259
+ decoder_config = self.get_config()
260
+
261
+ return decoder_config, input_ids, input_mask
262
+
263
+ def create_and_check_model(self, config, input_ids, attention_mask):
264
+ model = ClvpForCausalLM(config).to(torch_device).eval()
265
+ with torch.no_grad():
266
+ result = model(input_ids=input_ids, attention_mask=attention_mask)
267
+
268
+ self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.vocab_size))
269
+
270
+ def prepare_config_and_inputs_for_common(self):
271
+ config_and_inputs = self.prepare_config_and_inputs()
272
+ config, input_ids, attention_mask = config_and_inputs
273
+ inputs_dict = {
274
+ "input_ids": input_ids.to(torch_device),
275
+ "attention_mask": attention_mask.to(torch_device),
276
+ }
277
+ return config, inputs_dict
278
+
279
+
280
+ @require_torch
281
+ class ClvpDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
282
+ all_model_classes = (ClvpModel, ClvpForCausalLM) if is_torch_available() else ()
283
+ pipeline_model_mapping = {"feature-extraction": ClvpModelForConditionalGeneration} if is_torch_available() else {}
284
+
285
+ test_pruning = False
286
+
287
+ def setUp(self):
288
+ self.model_tester = ClvpDecoderTester(self)
289
+ self.decoder_config_tester = ConfigTester(self, config_class=ClvpDecoderConfig, hidden_size=32)
290
+
291
+ def tearDown(self):
292
+ super().tearDown()
293
+ # clean up as much of the GPU memory occupied by PyTorch as possible
294
+ cleanup(torch_device)
295
+
296
+ def test_model(self):
297
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
298
+ self.model_tester.create_and_check_model(*config_and_inputs)
299
+
300
+ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
301
+ if return_labels and model_class == ClvpForCausalLM:
302
+ inputs_dict["labels"] = torch.zeros(
303
+ [self.model_tester.batch_size, self.model_tester.seq_length], device=torch_device
304
+ ).long()
305
+
306
+ return inputs_dict
307
+
308
+ def test_training(self):
309
+ # we will only test the ClvpForCausalLM since it outputs loss
310
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
311
+ config.return_dict = True
312
+
313
+ model = ClvpForCausalLM(config)
314
+ model.to(torch_device)
315
+ model.train()
316
+ inputs = self._prepare_for_class(inputs_dict, ClvpForCausalLM, return_labels=True)
317
+ loss = model(**inputs).loss
318
+ loss.backward()
319
+
320
+ def test_training_gradient_checkpointing(self):
321
+ # we will only test the ClvpForCausalLM since it outputs loss
322
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
323
+ config.use_cache = False
324
+ config.return_dict = True
325
+
326
+ model = ClvpForCausalLM(config)
327
+ model.to(torch_device)
328
+ model.gradient_checkpointing_enable()
329
+ model.train()
330
+ inputs = self._prepare_for_class(inputs_dict, ClvpForCausalLM, return_labels=True)
331
+
332
+ loss = model(**inputs).loss
333
+ loss.backward()
334
+
335
+ @unittest.skip(reason="Clvp `prepare_inputs_for_generation` function doesn't have cache position.")
336
+ def test_generate_continue_from_inputs_embeds(self):
337
+ pass
338
+
339
+
340
+ class ClvpModelForConditionalGenerationTester:
341
+ def __init__(self, parent, is_training=False):
342
+ self.parent = parent
343
+ self.clvp_encoder_tester = ClvpEncoderTester(parent)
344
+ self.is_training = is_training
345
+ self.batch_size = self.clvp_encoder_tester.batch_size # need bs for batching_equivalence test
346
+
347
+ def get_config(self):
348
+ decoder_config = ClvpDecoderConfig(
349
+ vocab_size=50,
350
+ max_position_embeddings=30,
351
+ max_text_tokens=30,
352
+ hidden_size=128,
353
+ num_hidden_layers=1,
354
+ num_attention_heads=2,
355
+ bos_token_id=97,
356
+ eos_token_id=98,
357
+ relative_attention_num_buckets=4,
358
+ relative_attention_max_distance=16,
359
+ )
360
+ text_config = self.clvp_encoder_tester.get_config()
361
+ speech_config = self.clvp_encoder_tester.get_config()
362
+ speech_config.vocab_size = 300
363
+
364
+ return ClvpConfig.from_sub_model_configs(
365
+ text_config,
366
+ speech_config,
367
+ decoder_config,
368
+ projection_dim=16,
369
+ )
370
+
371
+ def prepare_config_and_inputs(self):
372
+ _, input_ids, attention_mask = self.clvp_encoder_tester.prepare_config_and_inputs()
373
+
374
+ ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
375
+ ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
376
+ _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
377
+
378
+ feature_extractor = ClvpFeatureExtractor()
379
+ input_features = feature_extractor(raw_speech=audio, sampling_rate=sr, return_tensors="pt")[
380
+ "input_features"
381
+ ].to(torch_device)
382
+
383
+ config = self.get_config()
384
+
385
+ return config, input_ids, attention_mask, input_features
386
+
387
+ def create_and_check_model(self, config, input_ids, attention_mask, input_features):
388
+ model = ClvpModelForConditionalGeneration(config).to(torch_device).eval()
389
+ with torch.no_grad():
390
+ result = model(input_ids=input_ids, input_features=input_features, attention_mask=attention_mask)
391
+
392
+ self.parent.assertEqual(result.logits_per_speech.shape, (2, self.clvp_encoder_tester.batch_size))
393
+ self.parent.assertEqual(result.logits_per_text.shape, (self.clvp_encoder_tester.batch_size, 2))
394
+
395
+ def prepare_config_and_inputs_for_common(self):
396
+ config_and_inputs = self.prepare_config_and_inputs()
397
+ config, input_ids, attention_mask, input_features = config_and_inputs
398
+ inputs_dict = {
399
+ "input_ids": input_ids.to(torch_device),
400
+ "attention_mask": attention_mask.to(torch_device),
401
+ "input_features": input_features.to(torch_device),
402
+ "return_loss": False,
403
+ }
404
+ return config, inputs_dict
405
+
406
+
407
+ @require_torch
408
+ class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase):
409
+ all_model_classes = (ClvpModelForConditionalGeneration,) if is_torch_available() else ()
410
+ # Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
411
+ all_generative_model_classes = ()
412
+
413
+ test_head_masking = False
414
+ test_pruning = False
415
+ test_resize_embeddings = False
416
+ test_attention_outputs = False
417
+ test_torchscript = False
418
+
419
+ def setUp(self):
420
+ self.model_tester = ClvpModelForConditionalGenerationTester(self)
421
+ common_properties = ["projection_dim", "logit_scale_init_value"]
422
+ self.clvp_config_tester = ConfigTester(
423
+ self, config_class=ClvpConfig, has_text_modality=False, common_properties=common_properties, hidden_size=32
424
+ )
425
+
426
+ def test_config(self):
427
+ self.clvp_config_tester.run_common_tests()
428
+
429
+ def tearDown(self):
430
+ super().tearDown()
431
+ # clean up as much of the GPU memory occupied by PyTorch as possible
432
+ cleanup(torch_device)
433
+
434
+ def test_model(self):
435
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
436
+ self.model_tester.create_and_check_model(*config_and_inputs)
437
+
438
+ def test_hidden_states_output(self):
439
+ def check_hidden_states_output(inputs_dict, config, model_class):
440
+ model = model_class(config)
441
+ model.to(torch_device)
442
+ model.eval()
443
+
444
+ with torch.no_grad():
445
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class))
446
+
447
+ # check for decoder model, text encoder model and speech encoder model hidden states
448
+ decoder_hidden_states = outputs.decoder_hidden_states
449
+ text_encoder_hidden_states = outputs.text_encoder_hidden_states
450
+ speech_encoder_hidden_states = outputs.speech_encoder_hidden_states
451
+
452
+ # check length of the hidden states
453
+ expected_decoder_num_layers = config.decoder_config.num_hidden_layers + 1
454
+ self.assertEqual(len(decoder_hidden_states), expected_decoder_num_layers)
455
+
456
+ expected_speech_encoder_num_layers = config.text_config.num_hidden_layers + 1
457
+ self.assertEqual(len(text_encoder_hidden_states), expected_speech_encoder_num_layers)
458
+
459
+ expected_text_encoder_num_layers = config.speech_config.num_hidden_layers + 1
460
+ self.assertEqual(len(speech_encoder_hidden_states), expected_text_encoder_num_layers)
461
+
462
+ # check shapes of each hidden state
463
+
464
+ # for the decoder model we will only test the dimension because the ClvpConditioningEncoder could increase
465
+ # the sequence lengths.
466
+ self.assertEqual(decoder_hidden_states[0].shape[-1], config.decoder_config.hidden_size)
467
+
468
+ # the testing for text encoder stays standard because we just pass the text tokens here.
469
+ self.assertListEqual(
470
+ list(text_encoder_hidden_states[0].shape[-2:]),
471
+ [self.model_tester.clvp_encoder_tester.seq_length, config.text_config.hidden_size],
472
+ )
473
+
474
+ # for the speech encoder model we will only test the dimension because the fix_decoder_outputs method could increase
475
+ # the sequence lengths by adding `decoder_fixing_codes` tokens at the end.
476
+ self.assertEqual(speech_encoder_hidden_states[0].shape[-1], config.speech_config.hidden_size)
477
+
478
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
479
+
480
+ for model_class in self.all_model_classes:
481
+ inputs_dict["output_hidden_states"] = True
482
+ check_hidden_states_output(inputs_dict, config, model_class)
483
+
484
+ # check that output_hidden_states also work using config
485
+ del inputs_dict["output_hidden_states"]
486
+ config.output_hidden_states = True
487
+
488
+ check_hidden_states_output(inputs_dict, config, model_class)
489
+
490
+ @unittest.skip(reason="Retain_grad is tested in individual model tests")
491
+ def test_retain_grad_hidden_states_attentions(self):
492
+ pass
493
+
494
+ @unittest.skip(reason="ClvpModelForConditionalGeneration does not have get_input_embeddings")
495
+ def test_inputs_embeds(self):
496
+ pass
497
+
498
+ @unittest.skip(reason="ClvpModelForConditionalGeneration does not have get_input_embeddings")
499
+ def test_model_get_set_embeddings(self):
500
+ pass
501
+
502
+ # override as the `logit_scale` parameter initialization is different for Clvp
503
+ def test_initialization(self):
504
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
505
+
506
+ configs_no_init = _config_zero_init(config)
507
+ for model_class in self.all_model_classes:
508
+ model = model_class(config=configs_no_init)
509
+ for name, param in model.named_parameters():
510
+ if param.requires_grad:
511
+ # check if `logit_scale` is initialized as per the original implementation
512
+ if name == "logit_scale":
513
+ expected_value = np.log(1 / 0.07)
514
+ returned_value = param.data.item()
515
+
516
+ self.assertAlmostEqual(
517
+ returned_value,
518
+ expected_value,
519
+ delta=1e-3,
520
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
521
+ )
522
+ else:
523
+ expected_range = [0.0, 1.0]
524
+ returned_range = ((param.data.mean() * 1e9).round() / 1e9).item()
525
+
526
+ self.assertIn(
527
+ returned_range,
528
+ expected_range,
529
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
530
+ )
531
+
532
+ def test_load_speech_text_decoder_config(self):
533
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
534
+
535
+ # Save ClvpConfig and check if we can load ClvpEncoderConfig from it
536
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
537
+ config.save_pretrained(tmp_dir_name)
538
+ encoder_config = ClvpEncoderConfig.from_pretrained(tmp_dir_name)
539
+ self.assertDictEqual(config.text_config.to_dict(), encoder_config.to_dict())
540
+
541
+ # Save ClvpConfig and check if we can load ClvpDecoderConfig from it
542
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
543
+ config.save_pretrained(tmp_dir_name)
544
+ decoder_config = ClvpDecoderConfig.from_pretrained(tmp_dir_name)
545
+ self.assertDictEqual(config.decoder_config.to_dict(), decoder_config.to_dict())
546
+
547
+ @slow
548
+ def test_model_from_pretrained(self):
549
+ model_name = "susnato/clvp_dev"
550
+ model = ClvpModelForConditionalGeneration.from_pretrained(model_name)
551
+ self.assertIsNotNone(model)
552
+
553
+
554
+ # Since Clvp is made up of several models connected to each other, it is better to test each of them individually along
555
+ # with a test_full_model_integration. If the model breaks in the future, this makes it easier to identify the broken part.
556
+
557
+
558
+ @slow
559
+ @require_torch
560
+ class ClvpIntegrationTest(unittest.TestCase):
561
+ def setUp(self):
562
+ self.text = "This is an example text."
563
+ ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
564
+ ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
565
+ _, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
566
+
567
+ self.model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev").to(torch_device)
568
+ self.model.eval()
569
+ tokenizer = ClvpTokenizer.from_pretrained("susnato/clvp_dev")
570
+ feature_extractor = ClvpFeatureExtractor.from_pretrained("susnato/clvp_dev")
571
+
572
+ tokenizer_output = tokenizer(self.text, return_tensors="pt")
573
+ self.text_tokens = tokenizer_output["input_ids"].to(torch_device)
574
+ self.input_features = feature_extractor(
575
+ raw_speech=self.speech_samples, sampling_rate=self.sr, return_tensors="pt"
576
+ )["input_features"].to(torch_device)
577
+
578
+ def tearDown(self):
579
+ super().tearDown()
580
+ # clean up as much of the GPU memory occupied by PyTorch as possible
581
+ cleanup(torch_device, gc_collect=True)
582
+
583
+ def test_conditional_encoder(self):
584
+ with torch.no_grad():
585
+ conditioning_encoder_outputs = self.model.conditioning_encoder(
586
+ input_features=self.input_features, input_ids=self.text_tokens
587
+ ).to("cpu")
588
+
589
+ self.assertEqual(
590
+ conditioning_encoder_outputs.shape,
591
+ torch.Size((self.input_features.shape[0], 18, self.model.config.decoder_config.hidden_size)),
592
+ )
593
+
594
+ EXPECTED_OUTPUTS = torch.tensor(
595
+ [[-0.8582, 0.5228, 1.9944], [-0.0465, -1.1017, -0.0093], [-0.0466, -0.6030, -0.1280]]
596
+ )
597
+
598
+ torch.testing.assert_close(conditioning_encoder_outputs[0, :3, :3], EXPECTED_OUTPUTS, rtol=1e-4, atol=1e-4)
599
+
600
+ def test_decoder_model_generate(self):
601
+ autoregressive_model_output = self.model.speech_decoder_model.generate(input_ids=self.text_tokens).cpu()
602
+
603
+ EXPECTED_OUTPUTS = torch.tensor([[147, 2, 54, 2, 43, 2, 169, 122, 29, 64, 2, 136, 37, 33, 9, 8193]])
604
+
605
+ torch.testing.assert_close(autoregressive_model_output, EXPECTED_OUTPUTS)
606
+
607
+ def test_text_and_speech_encoder_models(self):
608
+ # check for text embeds
609
+ text_embeds = self.model.text_encoder_model(input_ids=self.text_tokens, return_dict=True)[0].cpu()
610
+
611
+ # fmt: off
612
+ EXPECTED_TEXT_EMBEDS = torch.tensor([1.4798, -2.0005, 2.3902, -0.5042, 1.6401, -2.4135, -1.4800, 3.0118, -2.4422, 1.3266, 2.2339, 1.4761, -4.8983, -1.3592, 6.0251, 6.7364, 2.2576, 3.7229, -10.0436, 4.6676])
613
+ # fmt: on
614
+
615
+ torch.testing.assert_close(text_embeds[0, :20], EXPECTED_TEXT_EMBEDS, rtol=1e-4, atol=1e-4)
616
+
617
+ # check for speech embeds
618
+ speech_embeds = self.model.speech_encoder_model(input_ids=self.text_tokens, return_dict=True)[0].cpu()
619
+
620
+ # fmt: off
621
+ EXPECTED_SPEECH_EMBEDS = torch.tensor([3.1202, -3.1183, -1.4264, -6.1339, 1.8885, -0.1983, 0.9461, -1.7414, 0.3320, -3.8400, -1.5715, 1.5096, -1.7576, 0.2387, 4.9758, 5.8450, -6.2534, 2.8587, -5.5816, 4.7821])
622
+ # fmt: on
623
+
624
+ torch.testing.assert_close(speech_embeds[0, :20], EXPECTED_SPEECH_EMBEDS, rtol=1e-4, atol=1e-4)
625
+
626
+ def test_full_model_integration(self):
627
+ full_model_output = self.model.generate(
628
+ input_ids=self.text_tokens,
629
+ input_features=self.input_features,
630
+ do_sample=False,
631
+ num_beams=4,
632
+ num_return_sequences=4,
633
+ max_new_tokens=10,
634
+ )
635
+
636
+ EXPECTED_SPEECH_IDS = torch.tensor([[1953, 1080, 612], [1953, 612, 493], [1953, 612, 716]])
637
+ EXPECTED_SIMILARITY_SCORES = torch.tensor([[14.7660, 14.4569, 13.6472, 13.5683]])
638
+
639
+ torch.testing.assert_close(full_model_output.speech_ids.cpu()[-3:, -3:], EXPECTED_SPEECH_IDS)
640
+ torch.testing.assert_close(full_model_output.logits_per_text.cpu(), EXPECTED_SIMILARITY_SCORES)
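For orientation before the processor and tokenizer test files below, the integration tests above reduce to the following end-to-end flow. This is an editor's sketch rather than part of the test file; it assumes the `susnato/clvp_dev` checkpoint and the dummy LibriSpeech dataset used throughout these tests are reachable.

```python
# Sketch of the end-to-end CLVP flow that ClvpIntegrationTest exercises
# (assumes the susnato/clvp_dev checkpoint and the dummy LibriSpeech dataset).
import datasets
import torch

from transformers import ClvpFeatureExtractor, ClvpModelForConditionalGeneration, ClvpTokenizer

checkpoint = "susnato/clvp_dev"
tokenizer = ClvpTokenizer.from_pretrained(checkpoint)
feature_extractor = ClvpFeatureExtractor.from_pretrained(checkpoint)
model = ClvpModelForConditionalGeneration.from_pretrained(checkpoint).eval()

# Same text and audio sample the integration tests use.
text = "This is an example text."
ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
audio = ds.sort("id")[0]["audio"]

input_ids = tokenizer(text, return_tensors="pt")["input_ids"]
input_features = feature_extractor(
    raw_speech=audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt"
)["input_features"]

# generate() returns candidate speech ids plus text/speech similarity logits,
# which is what test_full_model_integration asserts against.
with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        input_features=input_features,
        do_sample=False,
        num_beams=4,
        num_return_sequences=4,
        max_new_tokens=10,
    )
print(output.speech_ids.shape, output.logits_per_text.shape)
```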
docs/transformers/tests/models/clvp/test_processor_clvp.py ADDED
@@ -0,0 +1,136 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import gc
17
+ import shutil
18
+ import tempfile
19
+ import unittest
20
+
21
+ from transformers import ClvpFeatureExtractor, ClvpProcessor, ClvpTokenizer
22
+ from transformers.testing_utils import require_torch
23
+
24
+ from .test_feature_extraction_clvp import floats_list
25
+
26
+
27
+ @require_torch
28
+ class ClvpProcessorTest(unittest.TestCase):
29
+ def setUp(self):
30
+ self.checkpoint = "susnato/clvp_dev"
31
+ self.tmpdirname = tempfile.mkdtemp()
32
+
33
+ def tearDown(self):
34
+ super().tearDown()
35
+ shutil.rmtree(self.tmpdirname)
36
+ gc.collect()
37
+
38
+ # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.get_tokenizer with Whisper->Clvp
39
+ def get_tokenizer(self, **kwargs):
40
+ return ClvpTokenizer.from_pretrained(self.checkpoint, **kwargs)
41
+
42
+ # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.get_feature_extractor with Whisper->Clvp
43
+ def get_feature_extractor(self, **kwargs):
44
+ return ClvpFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)
45
+
46
+ # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_save_load_pretrained_default with Whisper->Clvp
47
+ def test_save_load_pretrained_default(self):
48
+ tokenizer = self.get_tokenizer()
49
+ feature_extractor = self.get_feature_extractor()
50
+
51
+ processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
52
+
53
+ processor.save_pretrained(self.tmpdirname)
54
+ processor = ClvpProcessor.from_pretrained(self.tmpdirname)
55
+
56
+ self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
57
+ self.assertIsInstance(processor.tokenizer, ClvpTokenizer)
58
+
59
+ self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
60
+ self.assertIsInstance(processor.feature_extractor, ClvpFeatureExtractor)
61
+
62
+ # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_feature_extractor with Whisper->Clvp,processor(raw_speech->processor(raw_speech=raw_speech
63
+ def test_feature_extractor(self):
64
+ feature_extractor = self.get_feature_extractor()
65
+ tokenizer = self.get_tokenizer()
66
+
67
+ processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
68
+
69
+ raw_speech = floats_list((3, 1000))
70
+
71
+ input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
72
+ input_processor = processor(raw_speech=raw_speech, return_tensors="np")
73
+
74
+ for key in input_feat_extract.keys():
75
+ self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
76
+
77
+ # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_tokenizer with Whisper->Clvp
78
+ def test_tokenizer(self):
79
+ feature_extractor = self.get_feature_extractor()
80
+ tokenizer = self.get_tokenizer()
81
+
82
+ processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
83
+
84
+ input_str = "This is a test string"
85
+
86
+ encoded_processor = processor(text=input_str)
87
+
88
+ encoded_tok = tokenizer(input_str)
89
+
90
+ for key in encoded_tok.keys():
91
+ self.assertListEqual(encoded_tok[key], encoded_processor[key])
92
+
93
+ # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_tokenizer_decode with Whisper->Clvp
94
+ def test_tokenizer_decode(self):
95
+ feature_extractor = self.get_feature_extractor()
96
+ tokenizer = self.get_tokenizer()
97
+
98
+ processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
99
+
100
+ predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
101
+
102
+ decoded_processor = processor.batch_decode(predicted_ids)
103
+ decoded_tok = tokenizer.batch_decode(predicted_ids)
104
+
105
+ self.assertListEqual(decoded_tok, decoded_processor)
106
+
107
+ def test_save_load_pretrained_additional_features(self):
108
+ processor = ClvpProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
109
+ processor.save_pretrained(self.tmpdirname)
110
+
111
+ tokenizer_add_kwargs = self.get_tokenizer(pad_token="(PAD)")
112
+ feature_extractor_add_kwargs = self.get_feature_extractor(sampling_rate=16000)
113
+
114
+ processor = ClvpProcessor.from_pretrained(
115
+ self.tmpdirname,
116
+ pad_token="(PAD)",
117
+ sampling_rate=16000,
118
+ )
119
+
120
+ self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
121
+ self.assertIsInstance(processor.tokenizer, ClvpTokenizer)
122
+
123
+ self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
124
+ self.assertIsInstance(processor.feature_extractor, ClvpFeatureExtractor)
125
+
126
+ def test_model_input_names(self):
127
+ feature_extractor = self.get_feature_extractor()
128
+ tokenizer = self.get_tokenizer()
129
+
130
+ processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
131
+
132
+ self.assertListEqual(
133
+ sorted(processor.model_input_names),
134
+ sorted(set(feature_extractor.model_input_names + tokenizer.model_input_names)),
135
+ msg="`processor` and `feature_extractor` model input names do not match",
136
+ )
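To make the assertions above easier to read, here is roughly how `ClvpProcessor` is used on its own: text is forwarded to the wrapped tokenizer and raw speech to the wrapped feature extractor, which is the behaviour the tests compare against. A minimal sketch, assuming the same `susnato/clvp_dev` checkpoint the tests download.

```python
# Sketch only, not part of the test file: mirrors what ClvpProcessorTest checks.
import numpy as np

from transformers import ClvpFeatureExtractor, ClvpProcessor, ClvpTokenizer

checkpoint = "susnato/clvp_dev"
processor = ClvpProcessor(
    tokenizer=ClvpTokenizer.from_pretrained(checkpoint),
    feature_extractor=ClvpFeatureExtractor.from_pretrained(checkpoint),
)

# Text goes through the wrapped ClvpTokenizer (compare test_tokenizer).
text_inputs = processor(text="This is a test string")
print(text_inputs["input_ids"])

# Raw speech goes through the wrapped ClvpFeatureExtractor (compare test_feature_extractor).
raw_speech = [np.random.randn(1000).astype(np.float32) for _ in range(3)]
speech_inputs = processor(raw_speech=raw_speech, return_tensors="np")
print(speech_inputs["input_features"].shape)

# batch_decode is delegated to the tokenizer (compare test_tokenizer_decode).
print(processor.batch_decode([[1, 4, 5, 8, 1, 0, 8]]))
```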
docs/transformers/tests/models/clvp/test_tokenization_clvp.py ADDED
@@ -0,0 +1,317 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import json
17
+ import os
18
+ import unittest
19
+ from functools import lru_cache
20
+
21
+ from transformers import ClvpTokenizer
22
+
23
+ from ...test_tokenization_common import TokenizerTesterMixin, slow, use_cache_if_possible
24
+
25
+
26
+ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
27
+ from_pretrained_id = "susnato/clvp_dev"
28
+ tokenizer_class = ClvpTokenizer
29
+ test_rust_tokenizer = False
30
+ from_pretrained_kwargs = {"add_prefix_space": True}
31
+ test_seq2seq = False
32
+ test_sentencepiece_ignore_case = True
33
+
34
+ @classmethod
35
+ def setUpClass(cls):
36
+ super().setUpClass()
37
+
38
+ # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
39
+ vocab = [
40
+ "l",
41
+ "o",
42
+ "w",
43
+ "e",
44
+ "r",
45
+ "s",
46
+ "t",
47
+ "i",
48
+ "d",
49
+ "n",
50
+ "\u0120",
51
+ "\u0120l",
52
+ "\u0120n",
53
+ "\u0120lo",
54
+ "\u0120low",
55
+ "er",
56
+ "\u0120lowest",
57
+ "\u0120newer",
58
+ "\u0120wider",
59
+ "<unk>",
60
+ "<|endoftext|>",
61
+ "[SPACE]",
62
+ ]
63
+ vocab_tokens = dict(zip(vocab, range(len(vocab))))
64
+ merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
65
+ cls.special_tokens_map = {"unk_token": "<unk>"}
66
+
67
+ cls.vocab_file = os.path.join(cls.tmpdirname, "vocab.json")
68
+ cls.merges_file = os.path.join(cls.tmpdirname, "merges.txt")
69
+ with open(cls.vocab_file, "w", encoding="utf-8") as fp:
70
+ fp.write(json.dumps(vocab_tokens) + "\n")
71
+ with open(cls.merges_file, "w", encoding="utf-8") as fp:
72
+ fp.write("\n".join(merges))
73
+
74
+ # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_tokenizer with GPT2->Clvp
75
+ @classmethod
76
+ @use_cache_if_possible
77
+ @lru_cache(maxsize=64)
78
+ def get_tokenizer(cls, pretrained_name=None, **kwargs):
79
+ kwargs.update(cls.special_tokens_map)
80
+ pretrained_name = pretrained_name or cls.tmpdirname
81
+ return ClvpTokenizer.from_pretrained(pretrained_name, **kwargs)
82
+
83
+ # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts
84
+ def get_input_output_texts(self, tokenizer):
85
+ input_text = "lower newer"
86
+ output_text = "lower[SPACE]newer"
87
+ return input_text, output_text
88
+
89
+ # Copied from transformers.tests.models.layoutxlm.test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_add_special_tokens
90
+ def test_add_special_tokens(self):
91
+ tokenizers: list[ClvpTokenizer] = self.get_tokenizers(do_lower_case=False)
92
+ for tokenizer in tokenizers:
93
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
94
+ special_token = "[SPECIAL_TOKEN]"
95
+ special_token_box = [1000, 1000, 1000, 1000]
96
+
97
+ tokenizer.add_special_tokens({"cls_token": special_token})
98
+ encoded_special_token = tokenizer.encode(
99
+ [special_token], boxes=[special_token_box], add_special_tokens=False
100
+ )
101
+ self.assertEqual(len(encoded_special_token), 1)
102
+
103
+ decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True)
104
+ self.assertTrue(special_token not in decoded)
105
+
106
+ # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_rust_and_python_full_tokenizers
107
+ def test_rust_and_python_full_tokenizers(self):
108
+ if not self.test_rust_tokenizer:
109
+ self.skipTest(reason="test_rust_tokenizer is set to False")
110
+
111
+ tokenizer = self.get_tokenizer()
112
+ rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
113
+
114
+ sequence = "lower newer"
115
+
116
+ # Testing tokenization
117
+ tokens = tokenizer.tokenize(sequence, add_prefix_space=True)
118
+ rust_tokens = rust_tokenizer.tokenize(sequence)
119
+ self.assertListEqual(tokens, rust_tokens)
120
+
121
+ # Testing conversion to ids without special tokens
122
+ ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
123
+ rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
124
+ self.assertListEqual(ids, rust_ids)
125
+
126
+ # Testing conversion to ids with special tokens
127
+ rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
128
+ ids = tokenizer.encode(sequence, add_prefix_space=True)
129
+ rust_ids = rust_tokenizer.encode(sequence)
130
+ self.assertListEqual(ids, rust_ids)
131
+
132
+ # Testing the unknown token
133
+ input_tokens = tokens + [rust_tokenizer.unk_token]
134
+ input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
135
+ self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
136
+
137
+ # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_padding
138
+ def test_padding(self, max_length=15):
139
+ for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
140
+ with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
141
+ tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
142
+
143
+ # Simple input
144
+ s = "This is a simple input"
145
+ s2 = ["This is a simple input 1", "This is a simple input 2"]
146
+ p = ("This is a simple input", "This is a pair")
147
+ p2 = [
148
+ ("This is a simple input 1", "This is a simple input 2"),
149
+ ("This is a simple pair 1", "This is a simple pair 2"),
150
+ ]
151
+
152
+ # Simple input tests
153
+ self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
154
+
155
+ # Simple input
156
+ self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
157
+
158
+ # Simple input
159
+ self.assertRaises(
160
+ ValueError,
161
+ tokenizer_r.batch_encode_plus,
162
+ s2,
163
+ max_length=max_length,
164
+ padding="max_length",
165
+ )
166
+
167
+ # Pair input
168
+ self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
169
+
170
+ # Pair input
171
+ self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
172
+
173
+ # Pair input
174
+ self.assertRaises(
175
+ ValueError,
176
+ tokenizer_r.batch_encode_plus,
177
+ p2,
178
+ max_length=max_length,
179
+ padding="max_length",
180
+ )
181
+
182
+ # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_padding_if_pad_token_set_slow
183
+ def test_padding_if_pad_token_set_slow(self):
184
+ tokenizer = ClvpTokenizer.from_pretrained(self.tmpdirname, pad_token="<pad>")
185
+
186
+ # Simple input
187
+ s = "This is a simple input"
188
+ s2 = ["This is a simple input looooooooong", "This is a simple input"]
189
+ p = ("This is a simple input", "This is a pair")
190
+ p2 = [
191
+ ("This is a simple input loooooong", "This is a simple input"),
192
+ ("This is a simple pair loooooong", "This is a simple pair"),
193
+ ]
194
+
195
+ pad_token_id = tokenizer.pad_token_id
196
+
197
+ out_s = tokenizer(s, padding="max_length", max_length=30, return_tensors="np")
198
+ out_s2 = tokenizer(s2, padding=True, truncate=True, return_tensors="np")
199
+ out_p = tokenizer(*p, padding="max_length", max_length=60, return_tensors="np")
200
+ out_p2 = tokenizer(p2, padding=True, truncate=True, return_tensors="np")
201
+
202
+ # s
203
+ # test single string max_length padding
204
+ self.assertEqual(out_s["input_ids"].shape[-1], 30)
205
+ self.assertTrue(pad_token_id in out_s["input_ids"])
206
+ self.assertTrue(0 in out_s["attention_mask"])
207
+
208
+ # s2
209
+ # test automatic padding
210
+ self.assertEqual(out_s2["input_ids"].shape[-1], 33)
211
+ # long slice doesn't have padding
212
+ self.assertFalse(pad_token_id in out_s2["input_ids"][0])
213
+ self.assertFalse(0 in out_s2["attention_mask"][0])
214
+ # short slice does have padding
215
+ self.assertTrue(pad_token_id in out_s2["input_ids"][1])
216
+ self.assertTrue(0 in out_s2["attention_mask"][1])
217
+
218
+ # p
219
+ # test single pair max_length padding
220
+ self.assertEqual(out_p["input_ids"].shape[-1], 60)
221
+ self.assertTrue(pad_token_id in out_p["input_ids"])
222
+ self.assertTrue(0 in out_p["attention_mask"])
223
+
224
+ # p2
225
+ # test automatic padding pair
226
+ self.assertEqual(out_p2["input_ids"].shape[-1], 52)
227
+ # long slice pair doesn't have padding
228
+ self.assertFalse(pad_token_id in out_p2["input_ids"][0])
229
+ self.assertFalse(0 in out_p2["attention_mask"][0])
230
+ # short slice pair does have padding
231
+ self.assertTrue(pad_token_id in out_p2["input_ids"][1])
232
+ self.assertTrue(0 in out_p2["attention_mask"][1])
233
+
234
+ # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_special_tokens_mask_input_pairs_and_bos_token
235
+ def test_special_tokens_mask_input_pairs_and_bos_token(self):
236
+ # TODO: change to self.get_tokenizers() when the fast version is implemented
237
+ tokenizers = [self.get_tokenizer(do_lower_case=False, add_bos_token=True)]
238
+ for tokenizer in tokenizers:
239
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
240
+ sequence_0 = "Encode this."
241
+ sequence_1 = "This one too please."
242
+ encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
243
+ encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False)
244
+ encoded_sequence_dict = tokenizer.encode_plus(
245
+ sequence_0,
246
+ sequence_1,
247
+ add_special_tokens=True,
248
+ return_special_tokens_mask=True,
249
+ )
250
+ encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
251
+ special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
252
+ self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
253
+
254
+ filtered_sequence = [
255
+ (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
256
+ ]
257
+ filtered_sequence = [x for x in filtered_sequence if x is not None]
258
+ self.assertEqual(encoded_sequence, filtered_sequence)
259
+
260
+ def test_token_type_ids(self):
261
+ tokenizer = self.get_tokenizer()
262
+ seq_0 = "Test this method."
263
+
264
+ # We want sequence 0 and sequence 1 to be tagged
265
+ # respectively with 0 and 1 token_ids
266
+ # (regardless of whether the model uses token type ids)
267
+ # We rely on this assumption in the QA pipeline, among other places
268
+ output = tokenizer(seq_0, return_token_type_ids=True, add_special_tokens=True)
269
+ self.assertIn(0, output["token_type_ids"])
270
+
271
+ def test_full_tokenizer(self):
272
+ tokenizer = ClvpTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
273
+ text = "lower newer"
274
+ bpe_tokens = ["l", "o", "w", "er", "[SPACE]", "n", "e", "w", "er"]
275
+ tokens = tokenizer.tokenize(text, add_prefix_space=False)
276
+ self.assertListEqual(tokens, bpe_tokens)
277
+
278
+ input_tokens = tokens + [tokenizer.unk_token]
279
+ input_bpe_tokens = [0, 1, 2, 15, 21, 9, 3, 2, 15, 19]
280
+ self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
281
+
282
+ @slow
283
+ def test_outputs_with_numbers(self):
284
+ text = "hello and this is an example text and I have $1000. my lucky number is 12345."
285
+ tokenizer = ClvpTokenizer.from_pretrained("susnato/clvp_dev")
286
+
287
+ # fmt: off
288
+ EXPECTED_OUTPUT = [62, 84, 28, 2, 53, 2,147, 2, 54, 2, 43, 2, 169, 122, 29, 64, 2, 136, 37, 33, 2, 53, 2, 22,
289
+ 2, 148, 2, 110, 2, 40, 206, 53, 2, 134, 84, 59, 32, 9, 2, 125, 2, 25, 34, 197, 38, 2, 27,
290
+ 231, 15, 44, 2, 54, 2, 33, 100, 25, 76, 2, 40, 206, 53, 7, 2, 40, 46, 18, 2, 21, 97, 17,
291
+ 219, 2, 87, 210, 8, 19, 22, 76, 9,
292
+ ]
293
+ # fmt: on
294
+
295
+ self.assertListEqual(tokenizer.encode(text, add_special_tokens=False), EXPECTED_OUTPUT)
296
+
297
+ @slow
298
+ def test_tokenizer_integration(self):
299
+ sequences = [
300
+ "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
301
+ "general-purpose architectures (BERT, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
302
+ "Language Understanding (NLU) and Natural Language Generation (NLG) with over multiple pretrained "
303
+ "models and deep interoperability between Jax, PyTorch and TensorFlow.",
304
+ "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
305
+ "conditioning on both left and right context in all layers.",
306
+ "The quick brown fox jumps over the lazy dog.",
307
+ ]
308
+
309
+ # fmt: off
310
+ expected_encoding = {'input_ids': [[144, 43, 32, 87, 26, 173, 2, 5, 87, 26, 44, 70, 2, 209, 27, 2, 55, 2, 29, 38, 51, 31, 71, 8, 144, 43, 32, 87, 26, 173, 2, 53, 2, 29, 38, 51, 31, 71, 8, 29, 46, 144, 137, 49, 8, 15, 44, 33, 6, 2, 187, 35, 83, 61, 2, 20, 50, 44, 56, 8, 29, 121, 139, 66, 2, 59, 71, 60, 18, 16, 33, 34, 175, 2, 5, 15, 44, 33, 7, 2, 89, 15, 44, 33, 14, 7, 2, 37, 25, 26, 7, 2, 17, 54, 78, 25, 15, 44, 33, 7, 2, 37, 25, 111, 33, 9, 9, 9, 6, 2, 87, 2, 27, 48, 121, 56, 2, 25, 43, 20, 34, 14, 112, 2, 97, 234, 63, 53, 52, 2, 5, 27, 25, 34, 6, 2, 53, 2, 27, 48, 121, 56, 2, 25, 43, 20, 34, 14, 112, 2, 20, 50, 44, 158, 2, 5, 27, 25, 20, 6, 2, 103, 2, 253, 2, 26, 167, 78, 29, 64, 2, 29, 46, 144, 137, 49, 2, 115, 126, 25, 32, 2, 53, 2, 126, 18, 29, 2, 41, 114, 161, 44, 109, 151, 240, 2, 67, 33, 100, 50, 2, 23, 14, 37, 7, 2, 29, 38, 51, 31, 71, 2, 53, 2, 33, 50, 32, 57, 19, 25, 69, 9], [ 15, 44, 33, 2, 54, 2, 17, 61, 22, 20, 27, 49, 2, 51, 2, 29, 46, 8, 144, 137, 2, 126, 18, 29, 2, 15, 83, 22, 46, 16, 181, 56, 2, 46, 29, 175, 86, 158, 32, 2, 154, 2, 97, 25, 14, 67, 25, 49, 2, 136, 37, 33, 2, 185, 2, 23, 28, 41, 33, 70, 2, 135, 17, 60, 107, 52, 2, 47, 2, 165, 40, 2, 64, 19, 33, 2, 53, 2, 101, 104, 2, 135, 136, 37, 33, 2, 41, 2, 108, 2, 25, 88, 173, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [ 42, 2, 194, 91, 24, 2, 243, 190, 2, 182, 37, 2, 23, 231, 29, 32, 2, 253, 2, 42, 2, 25, 14, 39, 38, 2, 134, 20, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], # noqa: E501
311
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], # noqa: E501
312
+ }
313
+ # fmt: on
314
+
315
+ self.tokenizer_integration_test_util(
316
+ sequences=sequences, expected_encoding=expected_encoding, model_name="susnato/clvp_dev", padding=True
317
+ )
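As a companion to the fixture built in `setUpClass` above, the same toy vocabulary and merge list can be written out and loaded standalone. The sketch below (the temporary paths are illustrative, not the ones the test creates) shows why `get_input_output_texts` and `test_full_tokenizer` expect the space in `lower newer` to surface as the `[SPACE]` token.

```python
# Standalone sketch of the toy BPE fixture used by ClvpTokenizationTest above.
import json
import tempfile
from pathlib import Path

from transformers import ClvpTokenizer

vocab = [
    "l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
    "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low",
    "er", "\u0120lowest", "\u0120newer", "\u0120wider",
    "<unk>", "<|endoftext|>", "[SPACE]",
]
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]

tmpdir = Path(tempfile.mkdtemp())
vocab_file = tmpdir / "vocab.json"
merges_file = tmpdir / "merges.txt"
vocab_file.write_text(json.dumps(dict(zip(vocab, range(len(vocab))))), encoding="utf-8")
merges_file.write_text("\n".join(merges), encoding="utf-8")

tokenizer = ClvpTokenizer(str(vocab_file), str(merges_file), unk_token="<unk>")
# test_full_tokenizer asserts exactly this split, with the space mapped to [SPACE]:
print(tokenizer.tokenize("lower newer", add_prefix_space=False))
# expected: ['l', 'o', 'w', 'er', '[SPACE]', 'n', 'e', 'w', 'er']
```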
docs/transformers/tests/models/code_llama/__init__.py ADDED
File without changes
docs/transformers/tests/models/code_llama/test_tokenization_code_llama.py ADDED
@@ -0,0 +1,653 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import pickle
17
+ import shutil
18
+ import tempfile
19
+ import unittest
20
+
21
+ from datasets import load_dataset
22
+
23
+ from transformers import (
24
+ SPIECE_UNDERLINE,
25
+ AddedToken,
26
+ CodeLlamaTokenizer,
27
+ CodeLlamaTokenizerFast,
28
+ )
29
+ from transformers.convert_slow_tokenizer import convert_slow_tokenizer
30
+ from transformers.testing_utils import (
31
+ get_tests_dir,
32
+ nested_simplify,
33
+ require_sentencepiece,
34
+ require_tokenizers,
35
+ require_torch,
36
+ slow,
37
+ )
38
+
39
+ from ...test_tokenization_common import TokenizerTesterMixin
40
+
41
+
42
+ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
43
+
44
+
45
+ @require_sentencepiece
46
+ @require_tokenizers
47
+ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
48
+ from_pretrained_id = "hf-internal-testing/llama-code-tokenizer"
49
+ tokenizer_class = CodeLlamaTokenizer
50
+ rust_tokenizer_class = CodeLlamaTokenizerFast
51
+ test_rust_tokenizer = False
52
+ test_sentencepiece = True
53
+ from_pretrained_kwargs = {}
54
+
55
+ @classmethod
56
+ def setUpClass(cls):
57
+ super().setUpClass()
58
+
59
+ # We have a SentencePiece fixture for testing
60
+ tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
61
+ tokenizer.pad_token = tokenizer.eos_token
62
+ tokenizer.save_pretrained(cls.tmpdirname)
63
+
64
+ def get_tokenizers(cls, **kwargs):
65
+ kwargs.update({"pad_token": "<PAD>"})
66
+ return super().get_tokenizers(**kwargs)
67
+
68
+ def test_no_infilling_init(self):
69
+ tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, prefix_token=None, keep_accents=True)
70
+ with self.assertRaises(ValueError):
71
+ tokenizer.tokenize("This is <FILL_ME> prefix")
72
+
73
+ def test_full_tokenizer(self):
74
+ tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
75
+
76
+ tokens = tokenizer.tokenize("This is a test")
77
+ self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
78
+
79
+ self.assertListEqual(
80
+ tokenizer.convert_tokens_to_ids(tokens),
81
+ [285, 46, 10, 170, 382],
82
+ )
83
+
84
+ tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
85
+ self.assertListEqual(
86
+ tokens,
87
+ [
88
+ SPIECE_UNDERLINE + "I",
89
+ SPIECE_UNDERLINE + "was",
90
+ SPIECE_UNDERLINE + "b",
91
+ "or",
92
+ "n",
93
+ SPIECE_UNDERLINE + "in",
94
+ SPIECE_UNDERLINE + "",
95
+ "9",
96
+ "2",
97
+ "0",
98
+ "0",
99
+ "0",
100
+ ",",
101
+ SPIECE_UNDERLINE + "and",
102
+ SPIECE_UNDERLINE + "this",
103
+ SPIECE_UNDERLINE + "is",
104
+ SPIECE_UNDERLINE + "f",
105
+ "al",
106
+ "s",
107
+ "é",
108
+ ".",
109
+ ],
110
+ )
111
+ ids = tokenizer.convert_tokens_to_ids(tokens)
112
+ self.assertListEqual(
113
+ ids,
114
+ [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4],
115
+ )
116
+
117
+ back_tokens = tokenizer.convert_ids_to_tokens(ids)
118
+ self.assertListEqual(
119
+ back_tokens,
120
+ [
121
+ SPIECE_UNDERLINE + "I",
122
+ SPIECE_UNDERLINE + "was",
123
+ SPIECE_UNDERLINE + "b",
124
+ "or",
125
+ "n",
126
+ SPIECE_UNDERLINE + "in",
127
+ SPIECE_UNDERLINE + "",
128
+ "<unk>",
129
+ "2",
130
+ "0",
131
+ "0",
132
+ "0",
133
+ ",",
134
+ SPIECE_UNDERLINE + "and",
135
+ SPIECE_UNDERLINE + "this",
136
+ SPIECE_UNDERLINE + "is",
137
+ SPIECE_UNDERLINE + "f",
138
+ "al",
139
+ "s",
140
+ "<unk>",
141
+ ".",
142
+ ],
143
+ )
144
+
145
+ def test_save_pretrained(self):
146
+ self.tokenizers_list = [
147
+ (self.rust_tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}),
148
+ (self.tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}),
149
+ (self.tokenizer_class, "codellama/CodeLlama-34b-Instruct-hf", {}),
150
+ (self.rust_tokenizer_class, "codellama/CodeLlama-34b-Instruct-hf", {}),
151
+ ]
152
+ for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
153
+ with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
154
+ tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
155
+ tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
156
+
157
+ tmpdirname2 = tempfile.mkdtemp()
158
+
159
+ tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
160
+ tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
161
+
162
+ # Checks that it saves the same files, plus the tokenizer.json file for the fast one
163
+ self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
164
+ tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
165
+ self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
166
+
167
+ # Checks everything loads correctly in the same way
168
+ tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
169
+ tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
170
+
171
+ # Check special tokens are set accordingly on Rust and Python
172
+ for key in tokenizer_pp.special_tokens_map:
173
+ self.assertTrue(hasattr(tokenizer_rp, key))
174
+
175
+ shutil.rmtree(tmpdirname2)
176
+
177
+ # Save tokenizer rust, legacy_format=True
178
+ tmpdirname2 = tempfile.mkdtemp()
179
+
180
+ tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
181
+ tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
182
+
183
+ # Checks that it saves the same files
184
+ self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
185
+
186
+ # Checks everything loads correctly in the same way
187
+ tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
188
+ tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
189
+
190
+ # Check special tokens are set accordingly on Rust and Python
191
+ for key in tokenizer_pp.special_tokens_map:
192
+ self.assertTrue(hasattr(tokenizer_rp, key))
193
+
194
+ shutil.rmtree(tmpdirname2)
195
+
196
+ # Save tokenizer rust, legacy_format=False
197
+ tmpdirname2 = tempfile.mkdtemp()
198
+
199
+ tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
200
+ tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
201
+
202
+ # Checks it saved the tokenizer.json file
203
+ self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
204
+
205
+ # Checks everything loads correctly in the same way
206
+ tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
207
+ tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
208
+
209
+ # Check special tokens are set accordingly on Rust and Python
210
+ for key in tokenizer_pp.special_tokens_map:
211
+ self.assertTrue(hasattr(tokenizer_rp, key))
212
+
213
+ shutil.rmtree(tmpdirname2)
214
+
215
+ @require_torch
216
+ def test_batch_tokenization(self):
217
+ if not self.test_seq2seq:
218
+ self.skipTest(reason="test_seq2seq is False")
219
+
220
+ tokenizers = self.get_tokenizers()
221
+ for tokenizer in tokenizers:
222
+ with self.subTest(f"{tokenizer.__class__.__name__}"):
223
+ # Longer text that will definitely require truncation.
224
+ text = [
225
+ " UN Chief Says There Is No Military Solution in Syria",
226
+ " Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for"
227
+ " Syria is that 'there is no military solution' to the nearly five-year conflict and more weapons"
228
+ " will only worsen the violence and misery for millions of people.",
229
+ ]
230
+ try:
231
+ batch = tokenizer(
232
+ text=text,
233
+ max_length=3,
234
+ max_target_length=10,
235
+ return_tensors="pt",
236
+ )
237
+ except NotImplementedError:
238
+ self.skipTest(reason="Encountered NotImplementedError when calling tokenizer")
239
+ self.assertEqual(batch.input_ids.shape[1], 3)
240
+ # max_target_length will default to max_length if not specified
241
+ batch = tokenizer(text, max_length=3, return_tensors="pt")
242
+ self.assertEqual(batch.input_ids.shape[1], 3)
243
+
244
+ batch_encoder_only = tokenizer(text=text, max_length=3, max_target_length=10, return_tensors="pt")
245
+ self.assertEqual(batch_encoder_only.input_ids.shape[1], 3)
246
+ self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3)
247
+ self.assertNotIn("decoder_input_ids", batch_encoder_only)
248
+
249
+ @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.")
250
+ def test_save_slow_from_fast_and_reload_fast(self):
251
+ pass
252
+
253
+ def test_special_tokens_initialization(self):
254
+ for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
255
+ with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
256
+ added_tokens = [AddedToken("<special>", lstrip=True)]
257
+
258
+ tokenizer_r = self.get_rust_tokenizer(
259
+ pretrained_name, additional_special_tokens=added_tokens, **kwargs
260
+ )
261
+ r_output = tokenizer_r.encode("Hey this is a <special> token")
262
+
263
+ special_token_id = tokenizer_r.encode("<special>", add_special_tokens=False)[0]
264
+
265
+ self.assertTrue(special_token_id in r_output)
266
+
267
+ if self.test_slow_tokenizer:
268
+ tokenizer_cr = self.get_rust_tokenizer(
269
+ pretrained_name,
270
+ additional_special_tokens=added_tokens,
271
+ **kwargs, # , from_slow=True <- unfortunately too slow to convert
272
+ )
273
+ tokenizer_p = self.tokenizer_class.from_pretrained(
274
+ pretrained_name, additional_special_tokens=added_tokens, **kwargs
275
+ )
276
+
277
+ p_output = tokenizer_p.encode("Hey this is a <special> token")
278
+
279
+ cr_output = tokenizer_cr.encode("Hey this is a <special> token")
280
+
281
+ self.assertEqual(p_output, r_output)
282
+ self.assertEqual(cr_output, r_output)
283
+ self.assertTrue(special_token_id in p_output)
284
+ self.assertTrue(special_token_id in cr_output)
285
+
286
+ @slow
287
+ def test_tokenizer_integration(self):
288
+ expected_encoding = {'input_ids': [[1, 4103, 689, 414, 313, 24784, 368, 2998, 408, 282, 3637, 25350, 29899, 9067, 414, 322, 282, 3637, 25350, 29899, 1457, 3018, 1312, 29899, 2151, 29897, 8128, 2498, 29899, 15503, 4220, 6956, 1973, 313, 13635, 29911, 29892, 402, 7982, 29899, 29906, 29892, 1528, 13635, 29911, 29874, 29892, 1060, 26369, 29892, 6652, 309, 29933, 814, 29892, 1060, 29931, 6779, 11410, 363, 18385, 17088, 7634, 11235, 313, 25103, 29965, 29897, 322, 18385, 17088, 28203, 313, 25103, 29954, 29897, 411, 975, 29871, 29941, 29906, 29974, 758, 3018, 1312, 4733, 297, 29871, 29896, 29900, 29900, 29974, 10276, 322, 6483, 1006, 3372, 3097, 1546, 435, 1165, 29892, 10772, 29911, 25350, 322, 323, 6073, 17907, 29889], [1, 350, 20161, 338, 8688, 304, 758, 29899, 14968, 6483, 21000, 8684, 284, 22540, 515, 443, 29880, 24025, 1426, 491, 14002, 368, 4195, 292, 373, 1716, 2175, 322, 1492, 3030, 297, 599, 15359, 29889], [1, 450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203, 29889]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip
289
+
290
+ self.tokenizer_integration_test_util(
291
+ expected_encoding=expected_encoding,
292
+ model_name="hf-internal-testing/llama-code-tokenizer",
293
+ revision="6eb30c03ab6a9e2cdef4d523024909ec815ddb75",
294
+ padding=False,
295
+ )
296
+
297
+ def test_picklable(self):
298
+ with tempfile.NamedTemporaryFile() as f:
299
+ shutil.copyfile(SAMPLE_VOCAB, f.name)
300
+ tokenizer = CodeLlamaTokenizer(f.name, keep_accents=True)
301
+ pickled_tokenizer = pickle.dumps(tokenizer)
302
+ pickle.loads(pickled_tokenizer)
303
+
304
+ @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
305
+ def test_pickle_subword_regularization_tokenizer(self):
306
+ pass
307
+
308
+ @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
309
+ def test_subword_regularization_tokenizer(self):
310
+ pass
311
+
312
+
313
+ @require_torch
314
+ @require_sentencepiece
315
+ @require_tokenizers
316
+ class LlamaIntegrationTest(unittest.TestCase):
317
+ @classmethod
318
+ def setUpClass(cls):
319
+ checkpoint_name = "hf-internal-testing/llama-code-tokenizer"
320
+ cls.tokenizer: CodeLlamaTokenizer = CodeLlamaTokenizer.from_pretrained(checkpoint_name)
321
+ cls.rust_tokenizer = CodeLlamaTokenizerFast.from_pretrained(checkpoint_name)
322
+ return cls
323
+
324
+ @require_torch
325
+ def integration_tests(self):
326
+ inputs = self.tokenizer(
327
+ ["The following string should be properly encoded: Hello.", "But ird and ปี ird ด"],
328
+ return_tensors="pt",
329
+ )
330
+
331
+ self.assertEqual(
332
+ nested_simplify(inputs),
333
+ {
334
+ "input_ids": [
335
+ [1, 450, 1494, 1347, 881, 367, 6284, 18511, 29901, 15043, 29889],
336
+ [1, 1205, 29871, 1823, 322, 29871, 31010, 30691, 1678, 1823, 1678, 30718],
337
+ ],
338
+ "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
339
+ },
340
+ )
341
+
342
+ def test_fast_special_tokens(self):
343
+ slow_tokenizer = self.tokenizer
344
+ fast_tokenizer = self.rust_tokenizer
345
+ slow = slow_tokenizer.encode("A sample test", add_special_tokens=True)
346
+ assert slow == [1, 319, 4559, 1243]
347
+
348
+ fast_tokenizer.add_eos_token = False
349
+ fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
350
+ assert fast == [1, 319, 4559, 1243]
351
+
352
+ fast_tokenizer.add_eos_token = True
353
+ fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
354
+ assert fast == [1, 319, 4559, 1243, 2]
355
+
356
+ slow_tokenizer.add_eos_token = True
357
+ slow = slow_tokenizer.encode("A sample test", add_special_tokens=True)
358
+ assert slow == [1, 319, 4559, 1243, 2]
359
+
360
+ fast_tokenizer = CodeLlamaTokenizerFast.from_pretrained(
361
+ "hf-internal-testing/llama-tokenizer", add_eos_token=True, add_bos_token=False
362
+ )
363
+ fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
364
+ assert fast == [319, 4559, 1243, 2]
365
+
366
+ slow_tokenizer = CodeLlamaTokenizer.from_pretrained(
367
+ "hf-internal-testing/llama-tokenizer", add_eos_token=True, add_bos_token=False
368
+ )
369
+ slow = slow_tokenizer.encode("A sample test", add_special_tokens=True)
370
+ assert slow == [319, 4559, 1243, 2]
371
+
372
+ self.tokenizer.add_eos_token = False
373
+ self.rust_tokenizer.add_eos_token = False
374
+
375
+ @slow
376
+ def test_conversion(self):
377
+ # This is excruciatingly slow since it has to recreate the entire merge
378
+ # list from the original vocabulary in spm
379
+ self.rust_tokenizer.save_pretrained("./out")
380
+ with tempfile.TemporaryDirectory() as dirname:
381
+ self.rust_tokenizer.save_pretrained(dirname)
382
+
383
+ with open(os.path.join(dirname, "tokenizer.json")) as f:
384
+ old_serialized = f.read()
385
+
386
+ new_tokenizer = convert_slow_tokenizer(self.tokenizer)
387
+ with tempfile.NamedTemporaryFile() as f:
388
+ new_tokenizer.save(f.name)
389
+ # Re-opening since `f` is in bytes.
390
+ new_serialized = open(f.name).read()
391
+ with open("out_tokenizer.json", "w") as g:
392
+ g.write(new_serialized)
393
+
394
+ self.assertEqual(old_serialized, new_serialized)
395
+
396
+ def test_simple_encode_decode(self):
397
+ pyth_tokenizer = self.tokenizer
398
+ rust_tokenizer = self.rust_tokenizer
399
+
400
+ self.assertEqual(pyth_tokenizer.encode("This is a test"), [1, 910, 338, 263, 1243])
401
+ self.assertEqual(rust_tokenizer.encode("This is a test"), [1, 910, 338, 263, 1243])
402
+ self.assertEqual(pyth_tokenizer.decode([1, 910, 338, 263, 1243], skip_special_tokens=True), "This is a test")
403
+ self.assertEqual(rust_tokenizer.decode([1, 910, 338, 263, 1243], skip_special_tokens=True), "This is a test")
404
+
405
+ # bytefallback showcase
406
+ self.assertEqual(pyth_tokenizer.encode("生活的真谛是"), [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392]) # fmt: skip
407
+ self.assertEqual(rust_tokenizer.encode("生活的真谛是"), [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392]) # fmt: skip
408
+ self.assertEqual(
409
+ pyth_tokenizer.decode(
410
+ [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392], skip_special_tokens=True
411
+ ),
412
+ "生活的真谛是",
413
+ )
414
+ self.assertEqual(
415
+ rust_tokenizer.decode(
416
+ [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392], skip_special_tokens=True
417
+ ),
418
+ "生活的真谛是",
419
+ )
420
+
421
+ # Inner spaces showcase
422
+ self.assertEqual(pyth_tokenizer.encode("Hi Hello"), [1, 6324, 29871, 15043])
423
+ self.assertEqual(rust_tokenizer.encode("Hi Hello"), [1, 6324, 29871, 15043])
424
+ self.assertEqual(pyth_tokenizer.decode([1, 6324, 29871, 15043], skip_special_tokens=True), "Hi Hello")
425
+ self.assertEqual(rust_tokenizer.decode([1, 6324, 29871, 15043], skip_special_tokens=True), "Hi Hello")
426
+
427
+ self.assertEqual(pyth_tokenizer.encode("Hi Hello"), [1, 6324, 259, 15043])
428
+ self.assertEqual(rust_tokenizer.encode("Hi Hello"), [1, 6324, 259, 15043])
429
+ self.assertEqual(pyth_tokenizer.decode([1, 6324, 259, 15043], skip_special_tokens=True), "Hi Hello")
430
+ self.assertEqual(rust_tokenizer.decode([1, 6324, 259, 15043], skip_special_tokens=True), "Hi Hello")
431
+
432
+ self.assertEqual(pyth_tokenizer.encode(""), [1])
433
+ self.assertEqual(rust_tokenizer.encode(""), [1])
434
+
435
+ self.assertEqual(pyth_tokenizer.encode(" "), [1, 259])
436
+ self.assertEqual(rust_tokenizer.encode(" "), [1, 259])
437
+
438
+ self.assertEqual(pyth_tokenizer.encode(" "), [1, 1678])
439
+ self.assertEqual(rust_tokenizer.encode(" "), [1, 1678])
440
+
441
+ self.assertEqual(pyth_tokenizer.encode(" Hello"), [1, 29871, 15043])
442
+ self.assertEqual(rust_tokenizer.encode(" Hello"), [1, 29871, 15043])
443
+
444
+ def test_no_differences_showcase(self):
445
+ pyth_tokenizer = self.tokenizer
446
+ rust_tokenizer = self.rust_tokenizer
447
+ self.assertEqual(pyth_tokenizer.encode(""), [1])
448
+ self.assertEqual(rust_tokenizer.encode(""), [1])
449
+
450
+ self.assertEqual(pyth_tokenizer.encode(" "), [1, 259])
451
+ self.assertEqual(rust_tokenizer.encode(" "), [1, 259])
452
+
453
+ self.assertEqual(pyth_tokenizer.encode(" "), [1, 1678])
454
+ self.assertEqual(rust_tokenizer.encode(" "), [1, 1678])
455
+
456
+ self.assertEqual(pyth_tokenizer.encode(" Hello"), [1, 29871, 15043])
457
+ self.assertEqual(rust_tokenizer.encode(" Hello"), [1, 29871, 15043])
458
+
459
+ self.assertEqual(pyth_tokenizer.encode("<s>"), [1, 1])
460
+ self.assertEqual(rust_tokenizer.encode("<s>"), [1, 1])
461
+
462
+ def test_no_differences_decode(self):
463
+ pyth_tokenizer = self.tokenizer
464
+ rust_tokenizer = self.rust_tokenizer
465
+
466
+ self.assertEqual(pyth_tokenizer.decode([869]), ".")
467
+ self.assertEqual(rust_tokenizer.decode([869]), ".")
468
+
469
+ self.assertEqual(pyth_tokenizer.decode([30112, 869]), "ا .")
470
+ self.assertEqual(rust_tokenizer.decode([30112, 869]), "ا .")
471
+
472
+ def test_no_differences_special_tokens(self):
473
+ pyth_tokenizer = self.tokenizer
474
+ rust_tokenizer = self.rust_tokenizer
475
+ self.assertEqual(pyth_tokenizer.encode(""), [1])
476
+ self.assertEqual(rust_tokenizer.encode(""), [1])
477
+
478
+ self.assertEqual(pyth_tokenizer.encode("<s>"), [1, 1])
479
+ self.assertEqual(rust_tokenizer.encode("<s>"), [1, 1])
480
+
481
+ @unittest.skipIf(
482
+ os.getenv("RUN_TOKENIZER_INTEGRATION", "0") == "0",
483
+ "RUN_TOKENIZER_INTEGRATION=1 to run tokenizer integration tests",
484
+ )
485
+ def test_integration_test_xnli(self):
486
+ import tqdm
487
+
488
+ pyth_tokenizer = self.tokenizer
489
+ rust_tokenizer = self.rust_tokenizer
490
+
491
+ dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go")
492
+ for item in tqdm.tqdm(dataset["validation"]):
493
+ string = item["code"]
494
+ encoded1 = pyth_tokenizer.encode(string)
495
+ encoded2 = rust_tokenizer.encode(string)
496
+
497
+ self.assertEqual(encoded1, encoded2)
498
+
499
+ decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True)
500
+ decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True)
501
+
502
+ self.assertEqual(decoded1, decoded2)
503
+
504
+ dataset = load_dataset("facebook/xnli", "all_languages")
505
+
506
+ for item in tqdm.tqdm(dataset["train"]):
507
+ for string in item["premise"].values():
508
+ encoded1 = pyth_tokenizer.encode(string)
509
+ encoded2 = rust_tokenizer.encode(string)
510
+
511
+ self.assertEqual(encoded1, encoded2)
512
+
513
+ decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True)
514
+ decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True)
515
+
516
+ self.assertEqual(decoded1, decoded2)
517
+
518
+ def test_special_token_special_word(self):
519
+ # the word inform should be split as ['in', 'form']
520
+ tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf", legacy=False)
521
+ tokenizer.add_tokens([AddedToken("<REPR_END>", rstrip=True, lstrip=True)], special_tokens=False)
522
+ out1 = tokenizer.decode(
523
+ tokenizer.encode("<REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=False
524
+ )
525
+ self.assertEqual(out1, "<REPR_END>inform")
526
+ out2 = tokenizer.decode(
527
+ tokenizer.encode("<REPR_END>inform", add_special_tokens=False), spaces_between_special_tokens=True
528
+ )
529
+ # the added prefix token should not be decoded
530
+ self.assertEqual(out2, "<REPR_END> inform")
531
+ input_ids = tokenizer.encode("<REPR_END>inform", add_special_tokens=False)
532
+ self.assertEqual(input_ids, [29871, 32016, 262, 689]) # 29871 is the spiece underline, '▁'
533
+
534
+ out2 = tokenizer.decode(
535
+ tokenizer.encode(" <REPR_END> inform", add_special_tokens=False), spaces_between_special_tokens=False
536
+ )
537
+ # TODO @ArthurZ currently we strip left and right, so this will not keep the spaces
538
+ self.assertEqual(out2, "<REPR_END>inform")
539
+
540
+ ### Let's make sure decoding does not add extra spaces here and there
541
+ # TODO @ArthurZ this should be affected by the lstrip/rstrip/single word /normalize refactoring
542
+ # Since currently we always strip left and right of the token, results are as such
543
+ input_ids = tokenizer.encode("<s> Hello<s>how", add_special_tokens=False)
544
+ self.assertEqual(input_ids, [1, 15043, 1, 3525])
545
+ tokens = tokenizer.tokenize("<s> Hello<s>how", add_special_tokens=False)
546
+ self.assertEqual(tokens, ["<s>", "▁Hello", "<s>", "how"])
547
+ decoded_tokens = tokenizer.decode(input_ids)
548
+ self.assertEqual(decoded_tokens, "<s> Hello<s>how")
549
+
550
+ # Let's make sure that if there are any spaces, we don't remove them!
551
+ input_ids = tokenizer.encode(" <s> Hello<s> how", add_special_tokens=False)
552
+ self.assertEqual(input_ids, [259, 1, 15043, 1, 920])
553
+ tokens = tokenizer.tokenize(" <s> Hello<s> how", add_special_tokens=False)
554
+ self.assertEqual(tokens, ["▁▁", "<s>", "▁Hello", "<s>", "▁how"])
555
+ decoded_tokens = tokenizer.decode(input_ids)
556
+ self.assertEqual(decoded_tokens, " <s> Hello<s> how")
557
+
558
+ def test_fill_token(self):
559
+ tokenizer = CodeLlamaTokenizerFast.from_pretrained(
560
+ "codellama/CodeLlama-7b-hf", fill_token=None, prefix_token=None, suffix_token=None, middle_token=None
561
+ )
562
+ tokenizer.encode_plus("Hey how are you").input_ids
563
+ tokenizer.fill_token = "<FILL_ME>"
564
+ with self.assertRaises(ValueError):
565
+ tokenizer.encode("Hey how <FILL_ME> are you")
566
+ tokenizer.encode_plus("Hey how <FILL_ME> are you", "mne too")
567
+ tokenizer.tokenize("Hey how are you", "mne too")
568
+
569
+ tokenizer = CodeLlamaTokenizerFast.from_pretrained(
570
+ "codellama/CodeLlama-7b-hf", revision="3773f63b4511b9e47a9a7ffc765eed7eb0169486"
571
+ )
572
+ tokenizer.encode("Hey how <FILL_ME> are you")
573
+ tokenizer.encode_plus("Hey how <FILL_ME> are you", "mne too")
574
+ tokenizer.tokenize("Hey how are you", "mne too")
575
+
576
+ def test_spm_edge_cases(self):
577
+ # the word inform should be split as ['in', 'form']
578
+ tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf", legacy=False)
579
+ tokens = tokenizer.tokenize("[INST] How are you doing?<s>[/INST]")
580
+ self.assertEqual(
581
+ tokens, ["▁[", "INST", "]", "▁How", "▁are", "▁you", "▁doing", "?", "<s>", "[", "/", "INST", "]"]
582
+ )
583
+ inputs_ids = tokenizer.encode("[INST] How are you doing?<s>[/INST]")
584
+ self.assertEqual(
585
+ inputs_ids, [1, 518, 25580, 29962, 1128, 526, 366, 2599, 29973, 1, 29961, 29914, 25580, 29962]
586
+ )
587
+
588
+ def test_infilling_tokenization(self):
589
+ PROMPTS = [
590
+ '''def remove_non_ascii(s: str) -> str:
591
+ """ <FILL_ME>
592
+ return result
593
+ ''',
594
+ """# Installation instructions:
595
+ ```bash
596
+ <FILL_ME>
597
+ ```
598
+ This downloads the LLaMA inference code and installs the repository as a local pip package.
599
+ """,
600
+ """class InterfaceManagerFactory(AbstractManagerFactory):
601
+ def __init__(<FILL_ME>
602
+ def main():
603
+ factory = InterfaceManagerFactory(start=datetime.now())
604
+ managers = []
605
+ for i in range(10):
606
+ managers.append(factory.build(id=i))
607
+ """,
608
+ """/-- A quasi-prefunctoid is 1-connected iff all its etalisations are 1-connected. -/
609
+ theorem connected_iff_etalisation [C D : precategoroid] (P : quasi_prefunctoid C D) :
610
+ π₁ P = 0 ↔ <FILL_ME> = 0 :=
611
+ begin
612
+ split,
613
+ { intros h f,
614
+ rw pi_1_etalisation at h,
615
+ simp [h],
616
+ refl
617
+ },
618
+ { intro h,
619
+ have := @quasi_adjoint C D P,
620
+ simp [←pi_1_etalisation, this, h],
621
+ refl
622
+ }
623
+ end
624
+ """,
625
+ ]
626
+ tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
627
+ tokenizer_fast = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
628
+
629
+ formatted_prompt = tokenizer.tokenize(PROMPTS[0])
630
+ self.assertEqual(formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0]))
631
+ prefix, suffix = PROMPTS[0].split("<FILL_ME>")
632
+ self.assertEqual(formatted_prompt, tokenizer.tokenize(prefix, suffix))
633
+ self.assertEqual(formatted_prompt, tokenizer_fast.tokenize(prefix, suffix))
634
+
635
+ input_ids = tokenizer.encode(PROMPTS[0], add_special_tokens=False)
636
+ self.assertEqual(input_ids, tokenizer_fast.encode(PROMPTS[0], add_special_tokens=False))
637
+
638
+ prefix, suffix = PROMPTS[0].split("<FILL_ME>")
639
+ input_ids = tokenizer.encode(PROMPTS[0])
640
+ self.assertEqual(input_ids, tokenizer.encode(prefix, suffix=suffix))
641
+ self.assertEqual(tokenizer.encode(prefix, suffix=suffix), tokenizer_fast.encode(prefix, suffix=suffix))
642
+
643
+ # Adding suffix_first check for infilling tasks
644
+ suffix_first_formatted_prompt = tokenizer.tokenize(PROMPTS[0], suffix_first=True)
645
+ self.assertEqual(suffix_first_formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0], suffix_first=True))
646
+ prefix, suffix = PROMPTS[0].split("<FILL_ME>")
647
+ self.assertEqual(suffix_first_formatted_prompt, tokenizer.tokenize(prefix, suffix, suffix_first=True))
648
+ self.assertEqual(suffix_first_formatted_prompt, tokenizer_fast.tokenize(prefix, suffix, suffix_first=True))
649
+
650
+ prefix, suffix = PROMPTS[0].split("<FILL_ME>")
651
+ suffix_first_input_ids = tokenizer.encode(PROMPTS[0], suffix_first=True)
652
+ self.assertEqual(suffix_first_input_ids, tokenizer.encode(prefix, suffix=suffix, suffix_first=True))
653
+ self.assertEqual(suffix_first_input_ids, tokenizer_fast.encode(prefix, suffix=suffix, suffix_first=True))
docs/transformers/tests/models/codegen/__init__.py ADDED
File without changes
docs/transformers/tests/models/codegen/test_modeling_codegen.py ADDED
@@ -0,0 +1,492 @@
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import unittest
17
+
18
+ from transformers import CodeGenConfig, is_torch_available
19
+ from transformers.file_utils import cached_property
20
+ from transformers.testing_utils import backend_manual_seed, require_torch, slow, torch_device
21
+
22
+ from ...generation.test_utils import GenerationTesterMixin
23
+ from ...test_configuration_common import ConfigTester
24
+ from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
25
+ from ...test_pipeline_mixin import PipelineTesterMixin
26
+
27
+
28
+ if is_torch_available():
29
+ import torch
30
+
31
+ from transformers import AutoTokenizer, CodeGenForCausalLM, CodeGenModel
32
+
33
+
34
+ class CodeGenModelTester:
35
+ def __init__(
36
+ self,
37
+ parent,
38
+ batch_size=14,
39
+ seq_length=7,
40
+ is_training=True,
41
+ use_token_type_ids=True,
42
+ use_input_mask=True,
43
+ use_labels=True,
44
+ use_mc_token_ids=True,
45
+ vocab_size=256,
46
+ hidden_size=32,
47
+ rotary_dim=4,
48
+ num_hidden_layers=2,
49
+ num_attention_heads=4,
50
+ intermediate_size=37,
51
+ hidden_act="gelu",
52
+ hidden_dropout_prob=0.0,
53
+ attention_probs_dropout_prob=0.0,
54
+ max_position_embeddings=512,
55
+ type_vocab_size=16,
56
+ type_sequence_label_size=2,
57
+ initializer_range=0.02,
58
+ num_labels=3,
59
+ num_choices=4,
60
+ ):
61
+ self.parent = parent
62
+ self.batch_size = batch_size
63
+ self.seq_length = seq_length
64
+ self.is_training = is_training
65
+ self.use_token_type_ids = use_token_type_ids
66
+ self.use_input_mask = use_input_mask
67
+ self.use_labels = use_labels
68
+ self.use_mc_token_ids = use_mc_token_ids
69
+ self.vocab_size = vocab_size
70
+ self.hidden_size = hidden_size
71
+ self.rotary_dim = rotary_dim
72
+ self.num_hidden_layers = num_hidden_layers
73
+ self.num_attention_heads = num_attention_heads
74
+ self.intermediate_size = intermediate_size
75
+ self.hidden_act = hidden_act
76
+ self.hidden_dropout_prob = hidden_dropout_prob
77
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
78
+ self.max_position_embeddings = max_position_embeddings
79
+ self.type_vocab_size = type_vocab_size
80
+ self.type_sequence_label_size = type_sequence_label_size
81
+ self.initializer_range = initializer_range
82
+ self.num_labels = num_labels
83
+ self.num_choices = num_choices
84
+ self.scope = None
85
+ self.bos_token_id = vocab_size - 1
86
+ self.eos_token_id = vocab_size - 1
87
+ self.pad_token_id = vocab_size - 1
88
+
89
+ def get_large_model_config(self):
90
+ return CodeGenConfig.from_pretrained("Salesforce/codegen-2B-mono")
91
+
92
+ def prepare_config_and_inputs(self):
93
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
94
+
95
+ input_mask = None
96
+ if self.use_input_mask:
97
+ input_mask = random_attention_mask([self.batch_size, self.seq_length])
98
+
99
+ token_type_ids = None
100
+ if self.use_token_type_ids:
101
+ token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
102
+
103
+ mc_token_ids = None
104
+ if self.use_mc_token_ids:
105
+ mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
106
+
107
+ sequence_labels = None
108
+ token_labels = None
109
+ choice_labels = None
110
+ if self.use_labels:
111
+ sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
112
+ token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
113
+ choice_labels = ids_tensor([self.batch_size], self.num_choices)
114
+
115
+ config = self.get_config()
116
+
117
+ head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
118
+
119
+ return (
120
+ config,
121
+ input_ids,
122
+ input_mask,
123
+ head_mask,
124
+ token_type_ids,
125
+ mc_token_ids,
126
+ sequence_labels,
127
+ token_labels,
128
+ choice_labels,
129
+ )
130
+
131
+ def get_config(self):
132
+ return CodeGenConfig(
133
+ vocab_size=self.vocab_size,
134
+ n_embd=self.hidden_size,
135
+ n_layer=self.num_hidden_layers,
136
+ n_head=self.num_attention_heads,
137
+ intermediate_size=self.intermediate_size,
138
+ hidden_act=self.hidden_act,
139
+ hidden_dropout_prob=self.hidden_dropout_prob,
140
+ attention_probs_dropout_prob=self.attention_probs_dropout_prob,
141
+ n_positions=self.max_position_embeddings,
142
+ type_vocab_size=self.type_vocab_size,
143
+ initializer_range=self.initializer_range,
144
+ use_cache=True,
145
+ bos_token_id=self.bos_token_id,
146
+ eos_token_id=self.eos_token_id,
147
+ pad_token_id=self.pad_token_id,
148
+ rotary_dim=self.rotary_dim,
149
+ )
150
+
151
+ def create_and_check_codegen_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
152
+ model = CodeGenModel(config=config)
153
+ model.to(torch_device)
154
+ model.eval()
155
+
156
+ result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
157
+ result = model(input_ids, token_type_ids=token_type_ids)
158
+ result = model(input_ids)
159
+
160
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
161
+ self.parent.assertEqual(len(result.past_key_values), config.n_layer)
162
+
163
+ def create_and_check_codegen_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
164
+ model = CodeGenModel(config=config)
165
+ model.to(torch_device)
166
+ model.eval()
167
+
168
+ # first forward pass
169
+ outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True)
170
+ outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids)
171
+ outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False)
172
+
173
+ self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
174
+ self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
175
+
176
+ output, past = outputs.to_tuple()
177
+
178
+ # create hypothetical next token and extend to next_input_ids
179
+ next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
180
+ next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)
181
+
182
+ # append to next input_ids and token_type_ids
183
+ next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
184
+ next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
185
+
186
+ output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
187
+ output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[
188
+ "last_hidden_state"
189
+ ]
190
+
191
+ # select random slice
192
+ random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
193
+ output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
194
+ output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
195
+
196
+ # test that outputs are equal for slice
197
+ self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
198
+
199
+ def create_and_check_codegen_model_attention_mask_past(
200
+ self, config, input_ids, input_mask, head_mask, token_type_ids, *args
201
+ ):
202
+ model = CodeGenModel(config=config)
203
+ model.to(torch_device)
204
+ model.eval()
205
+
206
+ # create attention mask
207
+ attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
208
+ half_seq_length = self.seq_length // 2
209
+ attn_mask[:, half_seq_length:] = 0
210
+
211
+ # first forward pass
212
+ output, past = model(input_ids, attention_mask=attn_mask).to_tuple()
213
+
214
+ # create hypothetical next token and extend to next_input_ids
215
+ next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
216
+
217
+ # change a random masked slice from input_ids
218
+ random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
219
+ random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
220
+ input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
221
+
222
+ # append to next input_ids and attn_mask
223
+ next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
224
+ attn_mask = torch.cat(
225
+ [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
226
+ dim=1,
227
+ )
228
+
229
+ # get two different outputs
230
+ output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
231
+ output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"]
232
+
233
+ # select random slice
234
+ random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
235
+ output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
236
+ output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
237
+
238
+ # test that outputs are equal for slice
239
+ self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
240
+
241
+ def create_and_check_codegen_model_past_large_inputs(
242
+ self, config, input_ids, input_mask, head_mask, token_type_ids, *args
243
+ ):
244
+ model = CodeGenModel(config=config)
245
+ model.to(torch_device)
246
+ model.eval()
247
+
248
+ # first forward pass
249
+ outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True)
250
+
251
+ output, past = outputs.to_tuple()
252
+
253
+ # create hypothetical next token and extend to next_input_ids
254
+ next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
255
+ next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size)
256
+ next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
257
+
258
+ # append to next input_ids and token_type_ids
259
+ next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
260
+ next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
261
+ next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
262
+
263
+ output_from_no_past = model(
264
+ next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask
265
+ )["last_hidden_state"]
266
+ output_from_past = model(
267
+ next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past
268
+ )["last_hidden_state"]
269
+ self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])
270
+
271
+ # select random slice
272
+ random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
273
+ output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
274
+ output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
275
+
276
+ # test that outputs are equal for slice
277
+ self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
278
+
279
+ def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
280
+ model = CodeGenForCausalLM(config)
281
+ model.to(torch_device)
282
+ model.eval()
283
+
284
+ result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
285
+ self.parent.assertEqual(result.loss.shape, ())
286
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
287
+
288
+ def create_and_check_forward_and_backwards(
289
+ self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False
290
+ ):
291
+ model = CodeGenForCausalLM(config)
292
+ if gradient_checkpointing:
293
+ model.gradient_checkpointing_enable()
294
+ model.to(torch_device)
295
+
296
+ result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
297
+ self.parent.assertEqual(result.loss.shape, ())
298
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
299
+ result.loss.backward()
300
+
301
+ def prepare_config_and_inputs_for_common(self):
302
+ config_and_inputs = self.prepare_config_and_inputs()
303
+
304
+ (
305
+ config,
306
+ input_ids,
307
+ input_mask,
308
+ head_mask,
309
+ token_type_ids,
310
+ mc_token_ids,
311
+ sequence_labels,
312
+ token_labels,
313
+ choice_labels,
314
+ ) = config_and_inputs
315
+
316
+ inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask}
317
+
318
+ return config, inputs_dict
319
+
320
+
321
+ @require_torch
322
+ class CodeGenModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
323
+ all_model_classes = (CodeGenModel, CodeGenForCausalLM) if is_torch_available() else ()
324
+ pipeline_model_mapping = (
325
+ {"feature-extraction": CodeGenModel, "text-generation": CodeGenForCausalLM} if is_torch_available() else {}
326
+ )
327
+ fx_compatible = False
328
+ test_pruning = False
329
+ test_missing_keys = False
330
+ test_model_parallel = False
331
+ test_head_masking = False
332
+
333
+ # special case for DoubleHeads model
334
+ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
335
+ inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
336
+ return inputs_dict
337
+
338
+ def setUp(self):
339
+ self.model_tester = CodeGenModelTester(self)
340
+ self.config_tester = ConfigTester(self, config_class=CodeGenConfig, n_embd=37)
341
+
342
+ def test_config(self):
343
+ self.config_tester.run_common_tests()
344
+
345
+ def test_codegen_model(self):
346
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
347
+ self.model_tester.create_and_check_codegen_model(*config_and_inputs)
348
+
349
+ def test_codegen_model_past(self):
350
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
351
+ self.model_tester.create_and_check_codegen_model_past(*config_and_inputs)
352
+
353
+ def test_codegen_model_att_mask_past(self):
354
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
355
+ self.model_tester.create_and_check_codegen_model_attention_mask_past(*config_and_inputs)
356
+
357
+ def test_codegen_model_past_large_inputs(self):
358
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
359
+ self.model_tester.create_and_check_codegen_model_past_large_inputs(*config_and_inputs)
360
+
361
+ def test_codegen_lm_head_model(self):
362
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
363
+ self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
364
+
365
+ def test_codegen_gradient_checkpointing(self):
366
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
367
+ self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True)
368
+
369
+ @slow
370
+ def test_batch_generation(self):
371
+ tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
372
+ model = CodeGenForCausalLM.from_pretrained("Salesforce/codegen-350M-mono")
373
+ model.to(torch_device)
374
+
375
+ tokenizer.padding_side = "left"
376
+
377
+ # Define PAD Token = EOS Token = 50256
378
+ tokenizer.pad_token = tokenizer.eos_token
379
+ model.config.pad_token_id = model.config.eos_token_id
380
+
381
+ # use different length sentences to test batching
382
+ sentences = ["def hellow_world():", "def greet(name):"]
383
+
384
+ inputs = tokenizer(sentences, return_tensors="pt", padding=True)
385
+ input_ids = inputs["input_ids"].to(torch_device)
386
+ token_type_ids = torch.cat(
387
+ [
388
+ input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0),
389
+ input_ids.new_full((input_ids.shape[0], 1), 500),
390
+ ],
391
+ dim=-1,
392
+ )
393
+
394
+ outputs = model.generate(
395
+ input_ids=input_ids,
396
+ attention_mask=inputs["attention_mask"].to(torch_device),
397
+ )
398
+
399
+ outputs_tt = model.generate(
400
+ input_ids=input_ids,
401
+ attention_mask=inputs["attention_mask"].to(torch_device),
402
+ token_type_ids=token_type_ids,
403
+ )
404
+
405
+ inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
406
+ output_non_padded = model.generate(input_ids=inputs_non_padded)
407
+
408
+ num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
409
+ inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
410
+ output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
411
+
412
+ batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
413
+ batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True)
414
+ non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
415
+ padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
416
+
417
+ expected_output_sentence = [
418
+ 'def hellow_world():\n print("Hello World")\n\nhellow_world()',
419
+ 'def greet(name):\n print(f"Hello {name}")\n\ng',
420
+ ]
421
+ self.assertListEqual(expected_output_sentence, batch_out_sentence)
422
+ self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output
423
+ self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
424
+
425
+ @slow
426
+ def test_model_from_pretrained(self):
427
+ model_name = "Salesforce/codegen-350M-nl"
428
+ model = CodeGenModel.from_pretrained(model_name)
429
+ self.assertIsNotNone(model)
430
+
431
+
432
+ @require_torch
433
+ class CodeGenModelLanguageGenerationTest(unittest.TestCase):
434
+ @cached_property
435
+ def cached_tokenizer(self):
436
+ return AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
437
+
438
+ @cached_property
439
+ def cached_model(self):
440
+ return CodeGenForCausalLM.from_pretrained("Salesforce/codegen-350M-mono")
441
+
442
+ @slow
443
+ def test_lm_generate_codegen(self):
444
+ tokenizer = self.cached_tokenizer
445
+ for checkpointing in [True, False]:
446
+ model = self.cached_model
447
+
448
+ if checkpointing:
449
+ model.gradient_checkpointing_enable()
450
+ else:
451
+ model.gradient_checkpointing_disable()
452
+ model.to(torch_device)
453
+
454
+ inputs = tokenizer("def hello_world():", return_tensors="pt").to(torch_device)
455
+ expected_output = 'def hello_world():\n print("Hello World")\n\nhello_world()\n\n'
456
+
457
+ output_ids = model.generate(**inputs, do_sample=False)
458
+ output_str = tokenizer.batch_decode(output_ids)[0]
459
+
460
+ self.assertEqual(output_str, expected_output)
461
+
462
+ @slow
463
+ def test_codegen_sample(self):
464
+ tokenizer = self.cached_tokenizer
465
+ model = self.cached_model
466
+ model.to(torch_device)
467
+
468
+ torch.manual_seed(0)
469
+ backend_manual_seed(torch_device, 0)
470
+
471
+ tokenized = tokenizer("def hello_world():", return_tensors="pt", return_token_type_ids=True)
472
+ input_ids = tokenized.input_ids.to(torch_device)
473
+ output_ids = model.generate(input_ids, do_sample=True)
474
+ output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
475
+
476
+ token_type_ids = tokenized.token_type_ids.to(torch_device)
477
+ output_seq = model.generate(input_ids=input_ids, do_sample=True, num_return_sequences=5)
478
+ output_seq_tt = model.generate(
479
+ input_ids=input_ids, token_type_ids=token_type_ids, do_sample=True, num_return_sequences=5
480
+ )
481
+ output_seq_strs = tokenizer.batch_decode(output_seq, skip_special_tokens=True)
482
+ output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, skip_special_tokens=True)
483
+
484
+ if torch_device == "cuda":
485
+ EXPECTED_OUTPUT_STR = 'def hello_world():\n print("Hello World")\n return True\n\nresult ='
486
+ else:
487
+ EXPECTED_OUTPUT_STR = "def hello_world():\r\n print('Hello, World.')\r\n\r\n\r"
488
+
489
+ self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
490
+ self.assertTrue(
491
+ all(output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs)))
492
+ ) # token_type_ids should change output
docs/transformers/tests/models/codegen/test_tokenization_codegen.py ADDED
@@ -0,0 +1,329 @@
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import json
17
+ import os
18
+ import re
19
+ import unittest
20
+ from functools import lru_cache
21
+
22
+ from transformers import CodeGenTokenizer, CodeGenTokenizerFast
23
+ from transformers.models.codegen.tokenization_codegen import VOCAB_FILES_NAMES
24
+ from transformers.testing_utils import require_tokenizers, slow
25
+
26
+ from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
27
+
28
+
29
+ @require_tokenizers
30
+ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
31
+ from_pretrained_id = "Salesforce/codegen-350M-mono"
32
+ tokenizer_class = CodeGenTokenizer
33
+ rust_tokenizer_class = CodeGenTokenizerFast
34
+ test_rust_tokenizer = True
35
+ from_pretrained_kwargs = {"add_prefix_space": True}
36
+ test_seq2seq = False
37
+
38
+ @classmethod
39
+ def setUpClass(cls):
40
+ super().setUpClass()
41
+
42
+ # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
43
+ vocab = [
44
+ "l",
45
+ "o",
46
+ "w",
47
+ "e",
48
+ "r",
49
+ "s",
50
+ "t",
51
+ "i",
52
+ "d",
53
+ "n",
54
+ "\u0120",
55
+ "\u0120l",
56
+ "\u0120n",
57
+ "\u0120lo",
58
+ "\u0120low",
59
+ "er",
60
+ "\u0120lowest",
61
+ "\u0120newer",
62
+ "\u0120wider",
63
+ "<unk>",
64
+ "<|endoftext|>",
65
+ ]
66
+ vocab_tokens = dict(zip(vocab, range(len(vocab))))
67
+ merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
68
+ cls.special_tokens_map = {"unk_token": "<unk>"}
69
+
70
+ cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
71
+ cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
72
+ with open(cls.vocab_file, "w", encoding="utf-8") as fp:
73
+ fp.write(json.dumps(vocab_tokens) + "\n")
74
+ with open(cls.merges_file, "w", encoding="utf-8") as fp:
75
+ fp.write("\n".join(merges))
76
+
77
+ @classmethod
78
+ @use_cache_if_possible
79
+ @lru_cache(maxsize=64)
80
+ def get_tokenizer(cls, pretrained_name=None, **kwargs):
81
+ kwargs.update(cls.special_tokens_map)
82
+ pretrained_name = pretrained_name or cls.tmpdirname
83
+ return CodeGenTokenizer.from_pretrained(pretrained_name, **kwargs)
84
+
85
+ @classmethod
86
+ @use_cache_if_possible
87
+ @lru_cache(maxsize=64)
88
+ def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
89
+ kwargs.update(cls.special_tokens_map)
90
+ pretrained_name = pretrained_name or cls.tmpdirname
91
+ return CodeGenTokenizerFast.from_pretrained(pretrained_name, **kwargs)
92
+
93
+ def get_input_output_texts(self, tokenizer):
94
+ input_text = "lower newer"
95
+ output_text = "lower newer"
96
+ return input_text, output_text
97
+
98
+ def test_full_tokenizer(self):
99
+ tokenizer = CodeGenTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
100
+ text = "lower newer"
101
+ bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
102
+ tokens = tokenizer.tokenize(text, add_prefix_space=True)
103
+ self.assertListEqual(tokens, bpe_tokens)
104
+
105
+ input_tokens = tokens + [tokenizer.unk_token]
106
+ input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
107
+ self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
108
+
109
+ def test_rust_and_python_full_tokenizers(self):
110
+ if not self.test_rust_tokenizer:
111
+ self.skipTest(reason="test_rust_tokenizer is set to False")
112
+
113
+ tokenizer = self.get_tokenizer()
114
+ rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
115
+
116
+ sequence = "lower newer"
117
+
118
+ # Testing tokenization
119
+ tokens = tokenizer.tokenize(sequence, add_prefix_space=True)
120
+ rust_tokens = rust_tokenizer.tokenize(sequence)
121
+ self.assertListEqual(tokens, rust_tokens)
122
+
123
+ # Testing conversion to ids without special tokens
124
+ ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
125
+ rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
126
+ self.assertListEqual(ids, rust_ids)
127
+
128
+ # Testing conversion to ids with special tokens
129
+ rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
130
+ ids = tokenizer.encode(sequence, add_prefix_space=True)
131
+ rust_ids = rust_tokenizer.encode(sequence)
132
+ self.assertListEqual(ids, rust_ids)
133
+
134
+ # Testing the unknown token
135
+ input_tokens = tokens + [rust_tokenizer.unk_token]
136
+ input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
137
+ self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
138
+
139
+ @unittest.skip
140
+ def test_pretokenized_inputs(self, *args, **kwargs):
141
+ # It's very difficult to mix/test pretokenization with byte-level
142
+ # And get both CodeGen and Roberta to work at the same time (mostly an issue of adding a space before the string)
143
+ pass
144
+
145
+ def test_padding(self, max_length=15):
146
+ for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
147
+ with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
148
+ tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
149
+
150
+ # Simple input
151
+ s = "This is a simple input"
152
+ s2 = ["This is a simple input 1", "This is a simple input 2"]
153
+ p = ("This is a simple input", "This is a pair")
154
+ p2 = [
155
+ ("This is a simple input 1", "This is a simple input 2"),
156
+ ("This is a simple pair 1", "This is a simple pair 2"),
157
+ ]
158
+
159
+ # Simple input tests
160
+ self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
161
+
162
+ # Simple input
163
+ self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
164
+
165
+ # Simple input
166
+ self.assertRaises(
167
+ ValueError,
168
+ tokenizer_r.batch_encode_plus,
169
+ s2,
170
+ max_length=max_length,
171
+ padding="max_length",
172
+ )
173
+
174
+ # Pair input
175
+ self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
176
+
177
+ # Pair input
178
+ self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
179
+
180
+ # Pair input
181
+ self.assertRaises(
182
+ ValueError,
183
+ tokenizer_r.batch_encode_plus,
184
+ p2,
185
+ max_length=max_length,
186
+ padding="max_length",
187
+ )
188
+
189
+ def test_padding_if_pad_token_set_slow(self):
190
+ tokenizer = CodeGenTokenizer.from_pretrained(self.tmpdirname, pad_token="<pad>")
191
+
192
+ # Simple input
193
+ s = "This is a simple input"
194
+ s2 = ["This is a simple input looooooooong", "This is a simple input"]
195
+ p = ("This is a simple input", "This is a pair")
196
+ p2 = [
197
+ ("This is a simple input loooooong", "This is a simple input"),
198
+ ("This is a simple pair loooooong", "This is a simple pair"),
199
+ ]
200
+
201
+ pad_token_id = tokenizer.pad_token_id
202
+
203
+ out_s = tokenizer(s, padding="max_length", max_length=30, return_tensors="np")
204
+ out_s2 = tokenizer(s2, padding=True, truncate=True, return_tensors="np")
205
+ out_p = tokenizer(*p, padding="max_length", max_length=60, return_tensors="np")
206
+ out_p2 = tokenizer(p2, padding=True, truncate=True, return_tensors="np")
207
+
208
+ # s
209
+ # test single string max_length padding
210
+ self.assertEqual(out_s["input_ids"].shape[-1], 30)
211
+ self.assertTrue(pad_token_id in out_s["input_ids"])
212
+ self.assertTrue(0 in out_s["attention_mask"])
213
+
214
+ # s2
215
+ # test automatic padding
216
+ self.assertEqual(out_s2["input_ids"].shape[-1], 33)
217
+ # long slice doesn't have padding
218
+ self.assertFalse(pad_token_id in out_s2["input_ids"][0])
219
+ self.assertFalse(0 in out_s2["attention_mask"][0])
220
+ # short slice does have padding
221
+ self.assertTrue(pad_token_id in out_s2["input_ids"][1])
222
+ self.assertTrue(0 in out_s2["attention_mask"][1])
223
+
224
+ # p
225
+ # test single pair max_length padding
226
+ self.assertEqual(out_p["input_ids"].shape[-1], 60)
227
+ self.assertTrue(pad_token_id in out_p["input_ids"])
228
+ self.assertTrue(0 in out_p["attention_mask"])
229
+
230
+ # p2
231
+ # test automatic padding pair
232
+ self.assertEqual(out_p2["input_ids"].shape[-1], 52)
233
+ # long slice pair doesn't have padding
234
+ self.assertFalse(pad_token_id in out_p2["input_ids"][0])
235
+ self.assertFalse(0 in out_p2["attention_mask"][0])
236
+ # short slice pair does have padding
237
+ self.assertTrue(pad_token_id in out_p2["input_ids"][1])
238
+ self.assertTrue(0 in out_p2["attention_mask"][1])
239
+
240
+ def test_add_bos_token_slow(self):
241
+ bos_token = "$$$"
242
+ tokenizer = CodeGenTokenizer.from_pretrained(self.tmpdirname, bos_token=bos_token, add_bos_token=True)
243
+
244
+ s = "This is a simple input"
245
+ s2 = ["This is a simple input 1", "This is a simple input 2"]
246
+
247
+ bos_token_id = tokenizer.bos_token_id
248
+
249
+ out_s = tokenizer(s)
250
+ out_s2 = tokenizer(s2)
251
+
252
+ self.assertEqual(out_s.input_ids[0], bos_token_id)
253
+ self.assertTrue(all(o[0] == bos_token_id for o in out_s2.input_ids))
254
+
255
+ decode_s = tokenizer.decode(out_s.input_ids)
256
+ decode_s2 = tokenizer.batch_decode(out_s2.input_ids)
257
+
258
+ self.assertTrue(decode_s.startswith(bos_token))
259
+ self.assertTrue(all(d.startswith(bos_token) for d in decode_s2))
260
+
261
+ @slow
262
+ def test_truncation(self):
263
+ tokenizer = CodeGenTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
264
+
265
+ text = "\nif len_a > len_b:\n result = a\nelse:\n result = b\n\n\n\n#"
266
+ expected_truncated_text = "\nif len_a > len_b:\n result = a\nelse:\n result = b"
267
+
268
+ input_ids = tokenizer.encode(text)
269
+ truncation_pattern = ["^#", re.escape("<|endoftext|>"), "^'''", '^"""', "\n\n\n"]
270
+ decoded_text = tokenizer.decode(input_ids, truncate_before_pattern=truncation_pattern)
271
+ self.assertEqual(decoded_text, expected_truncated_text)
272
+ # TODO @ArthurZ outputs of the fast tokenizer are different in this case, un-related to the PR
273
+
274
+ # tokenizer has no padding token
275
+ @unittest.skip(reason="tokenizer has no padding token")
276
+ def test_padding_different_model_input_name(self):
277
+ pass
278
+
279
+ @slow
280
+ def test_tokenizer_integration(self):
281
+ # Custom test since this tokenizer takes return_token_type_ids as an init argument for backward compatibility.
282
+
283
+ sequences = [
284
+ "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
285
+ "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
286
+ "Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained "
287
+ "models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
288
+ "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
289
+ "conditioning on both left and right context in all layers.",
290
+ "The quick brown fox jumps over the lazy dog.",
291
+ ]
292
+
293
+ tokenizer_classes = [self.tokenizer_class]
294
+ if self.test_rust_tokenizer:
295
+ tokenizer_classes.append(self.rust_tokenizer_class)
296
+
297
+ # Test default case. i.e. return_token_type_ids is False.
298
+ for tokenizer_class in tokenizer_classes:
299
+ tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-350M-mono")
300
+
301
+ encoding = tokenizer(sequences)
302
+ decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]]
303
+
304
+ # fmt: off
305
+ expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501
306
+ # fmt: on
307
+
308
+ encoding_data = encoding.data
309
+ self.assertDictEqual(encoding_data, expected_encoding)
310
+
311
+ for expected, decoded in zip(sequences, decoded_sequences):
312
+ self.assertEqual(expected, decoded)
313
+
314
+ # Test return_token_type_ids is True case.
315
+ for tokenizer_class in tokenizer_classes:
316
+ tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-350M-mono", return_token_type_ids=True)
317
+
318
+ encoding = tokenizer(sequences)
319
+ decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]]
320
+
321
+ # fmt: off
322
+ expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501
323
+ # fmt: on
324
+
325
+ encoding_data = encoding.data
326
+ self.assertDictEqual(encoding_data, expected_encoding)
327
+
328
+ for expected, decoded in zip(sequences, decoded_sequences):
329
+ self.assertEqual(expected, decoded)
docs/transformers/tests/models/cohere/__init__.py ADDED
File without changes