| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import gc |
| import random |
| import unittest |
|
|
| import numpy as np |
| import torch |
| from transformers import XLMRobertaTokenizer |
|
|
| from diffusers import ( |
| AltDiffusionImg2ImgPipeline, |
| AutoencoderKL, |
| PNDMScheduler, |
| UNet2DConditionModel, |
| ) |
| from diffusers.image_processor import VaeImageProcessor |
| from diffusers.pipelines.alt_diffusion.modeling_roberta_series import ( |
| RobertaSeriesConfig, |
| RobertaSeriesModelWithTransformation, |
| ) |
| from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device |
| from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu |
|
|
|
|
# Runs once at import time, before either test class below is defined.
# NOTE(review): presumably switches torch/cuDNN into deterministic mode so the
# hard-coded expected slices in these tests are reproducible -- confirm against
# diffusers.utils.testing_utils.enable_full_determinism.
enable_full_determinism()
|
|
|
|
class AltDiffusionImg2ImgPipelineFastTests(unittest.TestCase):
    """Tests for ``AltDiffusionImg2ImgPipeline``, mostly built from tiny dummy components.

    The ``dummy_*`` properties build a fresh, seeded component on every access,
    so each test works on its own instances.
    """

    def tearDown(self):
        """Run garbage collection and empty the CUDA cache after every test."""
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @property
    def dummy_image(self):
        """Return the ``floats_tensor`` result for shape ``(1, 3, 32, 32)``, moved to ``torch_device``.

        The random source is ``random.Random(0)``, so the same seed is used on
        every access.
        """
        batch_size = 1
        num_channels = 3
        sizes = (32, 32)

        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
        return image

    @property
    def dummy_cond_unet(self):
        """Return a small ``UNet2DConditionModel`` created after ``torch.manual_seed(0)``."""
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        return model

    @property
    def dummy_vae(self):
        """Return a small ``AutoencoderKL`` (3 image channels, 4 latent channels), seeded with 0."""
        torch.manual_seed(0)
        model = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        return model

    @property
    def dummy_text_encoder(self):
        """Return a small ``RobertaSeriesModelWithTransformation``, seeded with 0.

        ``hidden_size`` and ``project_dim`` are both 32, the same value the
        dummy UNet uses for ``cross_attention_dim``.
        """
        torch.manual_seed(0)
        config = RobertaSeriesConfig(
            hidden_size=32,
            project_dim=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=5006,
        )
        return RobertaSeriesModelWithTransformation(config)

    @property
    def dummy_extractor(self):
        """Return a stand-in feature extractor.

        The returned callable ignores its arguments and yields an object that
        exposes ``pixel_values`` (an empty tensor) and a chainable ``to(device)``.
        """

        def extract(*args, **kwargs):
            class Out:
                def __init__(self):
                    self.pixel_values = torch.ones([0])

                def to(self, device):
                    # Tensor.to() returns a new tensor instead of moving in
                    # place, so the result has to be stored back.
                    self.pixel_values = self.pixel_values.to(device)
                    return self

            return Out()

        return extract

    def test_stable_diffusion_img2img_default_case(self):
        """Run the dummy pipeline on CPU and pin a 3x3 corner slice of the output.

        The pipeline is called twice with identically seeded generators, once
        returning the output object and once with ``return_dict=False``. Both
        results are checked against the same expected values.
        """
        # CPU is hard-coded here regardless of ``torch_device``.
        device = "cpu"
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
        tokenizer.model_max_length = 77

        init_image = self.dummy_image.to(device)
        # Affine rescale x -> x / 2 + 0.5.
        # NOTE(review): presumably maps [-1, 1] inputs to [0, 1]; the value
        # range produced by floats_tensor is not visible here -- confirm.
        init_image = init_image / 2 + 0.5

        # The safety checker is disabled; the extractor stub is still passed in.
        alt_pipe = AltDiffusionImg2ImgPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=True)
        alt_pipe = alt_pipe.to(device)
        alt_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = alt_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            image=init_image,
        )

        image = output.images

        # Re-seed so the tuple-returning call sees the same random stream.
        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = alt_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            image=init_image,
            return_dict=False,
        )[0]

        # Last 3 rows x last 3 columns of the last channel of the first image.
        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4427, 0.3731, 0.4249, 0.4941, 0.4546, 0.4148, 0.4193, 0.4666, 0.4499])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 5e-3

    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_img2img_fp16(self):
        """Test that stable diffusion img2img works with fp16"""
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
        tokenizer.model_max_length = 77

        init_image = self.dummy_image.to(torch_device)

        # Convert every model component to half precision.
        unet = unet.half()
        vae = vae.half()
        bert = bert.half()

        # The safety checker is disabled; the extractor stub is still passed in.
        alt_pipe = AltDiffusionImg2ImgPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False)
        alt_pipe = alt_pipe.to(torch_device)
        alt_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.manual_seed(0)
        image = alt_pipe(
            [prompt],
            generator=generator,
            num_inference_steps=2,
            output_type="np",
            image=init_image,
        ).images

        # Only the output shape is checked; no pixel values are pinned here.
        assert image.shape == (1, 32, 32, 3)

    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_img2img_pipeline_multiple_of_8(self):
        """Run the ``BAAI/AltDiffusion`` checkpoint on a 760x504 input and pin a centre slice."""
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        # 760 = 8 * 95 and 504 = 8 * 63: both sides are divisible by 8 but not by 16.
        init_image = init_image.resize((760, 504))

        model_id = "BAAI/AltDiffusion"
        pipe = AltDiffusionImg2ImgPipeline.from_pretrained(
            model_id,
            safety_checker=None,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.manual_seed(0)
        output = pipe(
            prompt=prompt,
            image=init_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        # 3x3 patch of the last channel taken around the image centre.
        image_slice = image[255:258, 383:386, -1]

        assert image.shape == (504, 760, 3)
        expected_slice = np.array([0.9358, 0.9397, 0.9599, 0.9901, 1.0000, 1.0000, 0.9882, 1.0000, 1.0000])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
|
|
|
|
@slow
@require_torch_gpu
class AltDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
    """Integration test running the ``BAAI/AltDiffusion`` checkpoint end to end.

    The class is gated by the ``slow`` and ``require_torch_gpu`` decorators.
    """

    def tearDown(self):
        """Collect garbage and empty the CUDA cache once a test has finished."""
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_stable_diffusion_img2img_pipeline_default(self):
        """Compare a 768x512 img2img result against a stored reference array."""
        sketch_url = (
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        reference_url = (
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_alt.npy"
        )

        # Fetch the inputs first: the resized sketch, then the reference output.
        sketch = load_image(sketch_url).resize((768, 512))
        reference = load_numpy(reference_url)

        pipe = AltDiffusionImg2ImgPipeline.from_pretrained("BAAI/AltDiffusion", safety_checker=None)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        # The global torch RNG is seeded only after the pipeline is set up,
        # immediately before the call that consumes it.
        call_kwargs = {
            "prompt": "A fantasy landscape, trending on artstation",
            "image": sketch,
            "strength": 0.75,
            "guidance_scale": 7.5,
            "generator": torch.manual_seed(0),
            "output_type": "np",
        }
        result = pipe(**call_kwargs).images[0]

        assert result.shape == (512, 768, 3)
        # Largest element-wise absolute deviation from the stored reference.
        max_abs_diff = np.abs(reference - result).max()
        assert max_abs_diff < 1e-2
|
|