import unittest

import numpy as np
import torch
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, GlmImagePipeline, GlmImageTransformer2DModel
from diffusers.utils import is_transformers_version

from ...testing_utils import enable_full_determinism, require_torch_accelerator, require_transformers_version_greater
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
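

# The GlmImage modeling classes only exist in newer transformers releases, so guard the import.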
if is_transformers_version(">=", "5.0.0.dev0"):
    from transformers import GlmImageConfig, GlmImageForConditionalGeneration, GlmImageProcessor
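

# Run all ops deterministically so the expected output slice in `test_inference` is reproducible.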
enable_full_determinism()


@require_transformers_version_greater("4.57.4")
@require_torch_accelerator
class GlmImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = GlmImagePipeline
    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs", "negative_prompt"}
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    required_optional_params = frozenset(
        [
            "num_inference_steps",
            "generator",
            "latents",
            "return_dict",
            "callback_on_step_end",
            "callback_on_step_end_tensor_inputs",
        ]
    )
    test_xformers_attention = False
    test_attention_slicing = False
    supports_dduf = False
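
    # Assemble tiny model components so the full pipeline can run quickly in the fast tests.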
    def get_dummy_components(self):
        torch.manual_seed(0)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
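
        # Minimal GlmImage config: two tiny layers everywhere keeps the vision-language encoder cheap.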
        glm_config = GlmImageConfig(
            text_config={
                "vocab_size": 168064,
                "hidden_size": 32,
                "intermediate_size": 32,
                "num_hidden_layers": 2,
                "num_attention_heads": 2,
                "num_key_value_heads": 2,
                "max_position_embeddings": 512,
                "vision_vocab_size": 128,
                "rope_parameters": {"mrope_section": (4, 2, 2)},
            },
            vision_config={
                "depth": 2,
                "hidden_size": 32,
                "num_heads": 2,
                "image_size": 32,
                "patch_size": 8,
                "intermediate_size": 32,
            },
            vq_config={"embed_dim": 32, "num_embeddings": 128, "latent_channels": 32},
        )

        torch.manual_seed(0)
        vision_language_encoder = GlmImageForConditionalGeneration(glm_config)

        processor = GlmImageProcessor.from_pretrained("zai-org/GLM-Image", subfolder="processor")
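
        # Tiny denoising transformer; `text_embed_dim` is tied to the dummy text encoder's hidden size.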
        torch.manual_seed(0)
        transformer = GlmImageTransformer2DModel(
            patch_size=2,
            in_channels=4,
            out_channels=4,
            num_layers=2,
            attention_head_dim=8,
            num_attention_heads=2,
            text_embed_dim=text_encoder.config.hidden_size,
            time_embed_dim=16,
            condition_dim=8,
            prior_vq_quantizer_codebook_size=128,
        )
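
        # Small VAE; latents_mean=0 and latents_std=1 make any latent (de)normalization the identity.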
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=(4, 8, 16, 16),
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
            norm_num_groups=4,
            sample_size=128,
            latents_mean=[0.0] * 4,
            latents_std=[1.0] * 4,
        )

        scheduler = FlowMatchEulerDiscreteScheduler()

        components = {
            "tokenizer": tokenizer,
            "processor": processor,
            "text_encoder": text_encoder,
            "vision_language_encoder": vision_language_encoder,
            "vae": vae,
            "transformer": transformer,
            "scheduler": scheduler,
        }

        return components

    def get_dummy_inputs(self, device, seed=0):
        # torch.Generator does not support "mps", so seed the global generator there instead.
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        height, width = 32, 32

        inputs = {
            "prompt": "A photo of a cat",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 1.5,
            "height": height,
            "width": width,
            "max_sequence_length": 16,
            "output_type": "pt",
        }

        return inputs

    def test_inference(self):
        device = "cpu"

        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = pipe(**inputs).images[0]
        # Check the first and last eight values of the flattened image against a reference slice.
        generated_slice = image.flatten()
        generated_slice = np.concatenate([generated_slice[:8], generated_slice[-8:]])

        # fmt: off
        expected_slice = np.array(
            [
                0.5796329, 0.5005878, 0.45881274, 0.45331675, 0.43688118, 0.4899527, 0.54017603, 0.50983673,
                0.3387968, 0.38074082, 0.29942477, 0.33733928, 0.3672544, 0.38462338, 0.40991822, 0.46641728,
            ]
        )
        # fmt: on

        self.assertEqual(image.shape, (3, 32, 32))
        self.assertTrue(np.allclose(expected_slice, generated_slice, atol=1e-4, rtol=1e-4))
| | @unittest.skip("Not supported.") |
| | def test_inference_batch_single_identical(self): |
| | |
| | pass |
| |
|
| | @unittest.skip("Not supported.") |
| | def test_inference_batch_consistent(self): |
| | |
| | pass |
| |
|
| | @unittest.skip("Not supported.") |
| | def test_num_images_per_prompt(self): |
| | |
| | pass |
| |
|
| | @unittest.skip("Needs to be revisited.") |
| | def test_encode_prompt_works_in_isolation(self): |
| | pass |
| |
|
| | @unittest.skip("Needs to be revisited.") |
| | def test_pipeline_level_group_offloading_inference(self): |
| | pass |
| |
|

    @unittest.skip(
        "This and the following tests are skipped because this pipeline doesn't guarantee identical outputs for the same inputs across consecutive runs."
    )
    def test_dict_tuple_outputs_equivalent(self):
        pass

    @unittest.skip("Skipped")
    def test_cpu_offload_forward_pass_twice(self):
        pass

    @unittest.skip("Skipped")
    def test_sequential_offload_forward_pass_twice(self):
        pass

    @unittest.skip("Skipped")
    def test_float16_inference(self):
        pass

    @unittest.skip("Skipped")
    def test_save_load_float16(self):
        pass

    @unittest.skip("Skipped")
    def test_save_load_local(self):
        pass