| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import unittest |
| |
|
| | import torch |
| |
|
| | from diffusers import AutoencoderKLCogVideoX |
| |
|
| | from ...testing_utils import ( |
| | enable_full_determinism, |
| | floats_tensor, |
| | torch_device, |
| | ) |
| | from ..test_modeling_common import ModelTesterMixin |
| | from .testing_utils import AutoencoderTesterMixin |
| |
|
| |
|
# Runs once at import time, before the test class below is defined.
# NOTE(review): presumably configures the test run for reproducible (deterministic)
# results — the helper lives in testing_utils, so confirm its exact effect there.
enable_full_determinism()
| |
|
| |
|
class AutoencoderKLCogVideoXTests(ModelTesterMixin, AutoencoderTesterMixin, unittest.TestCase):
    """Test case for `AutoencoderKLCogVideoX` built on the shared model/autoencoder mixins.

    The class supplies a small four-stage configuration and a 5-D "sample"
    input, adds one model-specific forward test, and opts out of the
    outputs-equivalence test.
    """

    model_class = AutoencoderKLCogVideoX
    main_input_name = "sample"
    base_precision = 1e-2

    def get_autoencoder_kl_cogvideox_config(self):
        """Return the constructor kwargs used to build the model under test.

        Every stage uses the same down/up block type and 8 output channels.
        """
        num_stages = 4
        return dict(
            in_channels=3,
            out_channels=3,
            down_block_types=("CogVideoXDownBlock3D",) * num_stages,
            up_block_types=("CogVideoXUpBlock3D",) * num_stages,
            block_out_channels=(8,) * num_stages,
            latent_channels=4,
            layers_per_block=1,
            norm_num_groups=2,
            temporal_compression_ratio=4,
        )

    @property
    def dummy_input(self):
        """Return the forward kwargs: a `floats_tensor` on `torch_device` under "sample".

        The tensor shape is (batch, channels, frames, 16, 16) = (4, 3, 8, 16, 16).
        """
        batch, channels, frames = 4, 3, 8
        spatial = (16, 16)
        video = floats_tensor((batch, channels, frames, *spatial)).to(torch_device)
        return {"sample": video}

    @property
    def input_shape(self):
        """Return the shape of one `dummy_input` sample, i.e. without the batch dimension."""
        return 3, 8, 16, 16

    @property
    def output_shape(self):
        """Return the expected per-sample output shape (same values as `input_shape`)."""
        return 3, 8, 16, 16

    def prepare_init_args_and_inputs_for_common(self):
        """Return the `(init kwargs, forward kwargs)` pair from the two helpers above."""
        return self.get_autoencoder_kl_cogvideox_config(), self.dummy_input

    def test_gradient_checkpointing_is_applied(self):
        # Delegates to the inherited test, passing this model's class names as `expected_set`.
        checkpointed_names = {
            "CogVideoXEncoder3D",
            "CogVideoXDecoder3D",
            "CogVideoXDownBlock3D",
            "CogVideoXMidBlock3D",
            "CogVideoXUpBlock3D",
        }
        super().test_gradient_checkpointing_is_applied(expected_set=checkpointed_names)

    def test_forward_with_norm_groups(self):
        # Rebuilds the model with 16 norm groups and wider blocks, then checks that a
        # forward pass returns something shaped exactly like the input.
        config, forward_kwargs = self.prepare_init_args_and_inputs_for_common()
        # NOTE(review): the block widths are raised together with the group count,
        # presumably so every width stays divisible by it — confirm against the model.
        config.update(norm_num_groups=16, block_out_channels=(16, 32, 32, 32))

        model = self.model_class(**config)
        model.to(torch_device)
        model.eval()

        with torch.no_grad():
            result = model(**forward_kwargs)

        # Dict-like outputs are reduced to their first tuple element before the checks.
        result = result.to_tuple()[0] if isinstance(result, dict) else result

        self.assertIsNotNone(result)
        self.assertEqual(result.shape, forward_kwargs["sample"].shape, "Input and output shapes do not match")

    @unittest.skip("Unsupported test.")
    def test_outputs_equivalence(self):
        # Overridden only so the skip decorator replaces the inherited test of this name.
        pass
| |
|