Spaces:

Skywork
/

Unipic3

Running on Zero

App Files Community

OrlandoHugBot committed 26 days ago

Commit

24ace1c

verified ·

1 Parent(s): fc3e76d

Update pipeline_qwenimage_edit.py

Browse files

Files changed (1) hide show

pipeline_qwenimage_edit.py +13 -15

pipeline_qwenimage_edit.py CHANGED Viewed

@@ -48,7 +48,6 @@ EXAMPLE_DOC_STRING = """
         >>> from PIL import Image
         >>> from diffusers import QwenImageEditPipeline
         >>> from diffusers.utils import load_image
         >>> pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16)
         >>> pipe.to("cuda")
         >>> image = load_image(
@@ -91,7 +90,6 @@ def retrieve_timesteps(
     r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
     Args:
         scheduler (`SchedulerMixin`):
             The scheduler to get timesteps from.
@@ -106,7 +104,6 @@ def retrieve_timesteps(
         sigmas (`List[float]`, *optional*):
             Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
             `num_inference_steps` and `timesteps` must be `None`.
     Returns:
         `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
         second element is the number of inference steps.
@@ -177,7 +174,6 @@ def resize_to_multiple_of(image, multiple_of=32):
 class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
     r"""
     The Qwen-Image-Edit pipeline for image editing.
     Args:
         transformer ([`QwenImageTransformer2DModel`]):
             Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
@@ -244,7 +240,10 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
         device: Optional[torch.device] = None,
         dtype: Optional[torch.dtype] = None,
     ):
-        device = device or self._execution_device
         dtype = dtype or self.text_encoder.dtype
         prompts = [prompts] if isinstance(prompts, str) else prompts
@@ -277,14 +276,14 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
             )
             texts.append(text)
-        # Process inputs
         model_inputs = self.processor(
             text=texts,
             images=images,
             do_resize=False,   # already resized
             padding=True,
             return_tensors="pt"
-        ).to(self.device)
         # template = self.prompt_template_encode
         drop_idx = self.prompt_template_encode_start_idx
@@ -333,7 +332,6 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
         max_sequence_length: int = 1024,
     ):
         r"""
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 prompt to be encoded
@@ -347,7 +345,9 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
         """
-        device = device or self._execution_device
         prompt = [prompt] if isinstance(prompt, str) else prompt
         batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0]
@@ -601,7 +601,6 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
     ):
         r"""
         Function invoked when calling the pipeline for generation.
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
@@ -629,7 +628,6 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                 the text `prompt`, usually at the expense of lower image quality.
                 This parameter in the pipeline is there to support future guidance-distilled models when they come up.
                 Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance,
                 please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should
@@ -669,9 +667,7 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
             max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
         Examples:
         Returns:
             [`~pipelines.qwenimage.QwenImagePipelineOutput`] or `tuple`:
             [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
@@ -720,7 +716,9 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
         else:
             batch_size = prompt_embeds.shape[0]
-        device = self._execution_device
         # 3. Preprocess image
         prompt_images = [image.resize((round(image.width * 28 / 32), round(image.height * 28 / 32))) for image in images]
@@ -907,4 +905,4 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
         if not return_dict:
             return (image,)
-        return QwenImagePipelineOutput(images=image)

         >>> from PIL import Image
         >>> from diffusers import QwenImageEditPipeline
         >>> from diffusers.utils import load_image
         >>> pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16)
         >>> pipe.to("cuda")
         >>> image = load_image(
     r"""
     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
     Args:
         scheduler (`SchedulerMixin`):
             The scheduler to get timesteps from.
         sigmas (`List[float]`, *optional*):
             Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
             `num_inference_steps` and `timesteps` must be `None`.
     Returns:
         `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
         second element is the number of inference steps.
 class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
     r"""
     The Qwen-Image-Edit pipeline for image editing.
     Args:
         transformer ([`QwenImageTransformer2DModel`]):
             Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
         device: Optional[torch.device] = None,
         dtype: Optional[torch.dtype] = None,
     ):
+        # 修复：优先使用 text_encoder 的实际设备，而不是 self.device
+        # 这确保在 ZeroGPU 环境下设备一致性
+        if device is None:
+            device = next(self.text_encoder.parameters()).device
         dtype = dtype or self.text_encoder.dtype
         prompts = [prompts] if isinstance(prompts, str) else prompts
             )
             texts.append(text)
+        # Process inputs - 修复：使用 text_encoder 的实际设备
         model_inputs = self.processor(
             text=texts,
             images=images,
             do_resize=False,   # already resized
             padding=True,
             return_tensors="pt"
+        ).to(device)
         # template = self.prompt_template_encode
         drop_idx = self.prompt_template_encode_start_idx
         max_sequence_length: int = 1024,
     ):
         r"""
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 prompt to be encoded
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
         """
+        # 修复：优先使用 text_encoder 的实际设备
+        if device is None:
+            device = next(self.text_encoder.parameters()).device
         prompt = [prompt] if isinstance(prompt, str) else prompt
         batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0]
     ):
         r"""
         Function invoked when calling the pipeline for generation.
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                 the text `prompt`, usually at the expense of lower image quality.
                 This parameter in the pipeline is there to support future guidance-distilled models when they come up.
                 Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance,
                 please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
             max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
         Examples:
         Returns:
             [`~pipelines.qwenimage.QwenImagePipelineOutput`] or `tuple`:
             [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
         else:
             batch_size = prompt_embeds.shape[0]
+        # 修复：使用 text_encoder 的实际设备，而不是 _execution_device
+        device = next(self.text_encoder.parameters()).device
         # 3. Preprocess image
         prompt_images = [image.resize((round(image.width * 28 / 32), round(image.height * 28 / 32))) for image in images]
         if not return_dict:
             return (image,)
+        return QwenImagePipelineOutput(images=image)