Spaces:
Running
on
Zero
Running
on
Zero
Update pipeline_qwenimage_edit.py
Browse files- pipeline_qwenimage_edit.py +13 -15
pipeline_qwenimage_edit.py
CHANGED
|
@@ -48,7 +48,6 @@ EXAMPLE_DOC_STRING = """
|
|
| 48 |
>>> from PIL import Image
|
| 49 |
>>> from diffusers import QwenImageEditPipeline
|
| 50 |
>>> from diffusers.utils import load_image
|
| 51 |
-
|
| 52 |
>>> pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16)
|
| 53 |
>>> pipe.to("cuda")
|
| 54 |
>>> image = load_image(
|
|
@@ -91,7 +90,6 @@ def retrieve_timesteps(
|
|
| 91 |
r"""
|
| 92 |
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
| 93 |
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
| 94 |
-
|
| 95 |
Args:
|
| 96 |
scheduler (`SchedulerMixin`):
|
| 97 |
The scheduler to get timesteps from.
|
|
@@ -106,7 +104,6 @@ def retrieve_timesteps(
|
|
| 106 |
sigmas (`List[float]`, *optional*):
|
| 107 |
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
|
| 108 |
`num_inference_steps` and `timesteps` must be `None`.
|
| 109 |
-
|
| 110 |
Returns:
|
| 111 |
`Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
|
| 112 |
second element is the number of inference steps.
|
|
@@ -177,7 +174,6 @@ def resize_to_multiple_of(image, multiple_of=32):
|
|
| 177 |
class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
|
| 178 |
r"""
|
| 179 |
The Qwen-Image-Edit pipeline for image editing.
|
| 180 |
-
|
| 181 |
Args:
|
| 182 |
transformer ([`QwenImageTransformer2DModel`]):
|
| 183 |
Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
|
|
@@ -244,7 +240,10 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
|
|
| 244 |
device: Optional[torch.device] = None,
|
| 245 |
dtype: Optional[torch.dtype] = None,
|
| 246 |
):
|
| 247 |
-
|
|
|
|
|
|
|
|
|
|
| 248 |
dtype = dtype or self.text_encoder.dtype
|
| 249 |
|
| 250 |
prompts = [prompts] if isinstance(prompts, str) else prompts
|
|
@@ -277,14 +276,14 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
|
|
| 277 |
)
|
| 278 |
texts.append(text)
|
| 279 |
|
| 280 |
-
# Process inputs
|
| 281 |
model_inputs = self.processor(
|
| 282 |
text=texts,
|
| 283 |
images=images,
|
| 284 |
do_resize=False, # already resized
|
| 285 |
padding=True,
|
| 286 |
return_tensors="pt"
|
| 287 |
-
).to(
|
| 288 |
|
| 289 |
# template = self.prompt_template_encode
|
| 290 |
drop_idx = self.prompt_template_encode_start_idx
|
|
@@ -333,7 +332,6 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
|
|
| 333 |
max_sequence_length: int = 1024,
|
| 334 |
):
|
| 335 |
r"""
|
| 336 |
-
|
| 337 |
Args:
|
| 338 |
prompt (`str` or `List[str]`, *optional*):
|
| 339 |
prompt to be encoded
|
|
@@ -347,7 +345,9 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
|
|
| 347 |
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
| 348 |
provided, text embeddings will be generated from `prompt` input argument.
|
| 349 |
"""
|
| 350 |
-
|
|
|
|
|
|
|
| 351 |
|
| 352 |
prompt = [prompt] if isinstance(prompt, str) else prompt
|
| 353 |
batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0]
|
|
@@ -601,7 +601,6 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
|
|
| 601 |
):
|
| 602 |
r"""
|
| 603 |
Function invoked when calling the pipeline for generation.
|
| 604 |
-
|
| 605 |
Args:
|
| 606 |
prompt (`str` or `List[str]`, *optional*):
|
| 607 |
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
|
|
@@ -629,7 +628,6 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
|
|
| 629 |
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
| 630 |
`guidance_scale > 1`. A higher guidance scale encourages the model to generate images that are closely linked to
|
| 631 |
the text `prompt`, usually at the expense of lower image quality.
|
| 632 |
-
|
| 633 |
This parameter in the pipeline is there to support future guidance-distilled models when they come up.
|
| 634 |
Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance,
|
| 635 |
please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should
|
|
@@ -669,9 +667,7 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
|
|
| 669 |
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
| 670 |
`._callback_tensor_inputs` attribute of your pipeline class.
|
| 671 |
max_sequence_length (`int`, defaults to 512): Maximum sequence length to use with the `prompt`.
|
| 672 |
-
|
| 673 |
Examples:
|
| 674 |
-
|
| 675 |
Returns:
|
| 676 |
[`~pipelines.qwenimage.QwenImagePipelineOutput`] or `tuple`:
|
| 677 |
[`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
|
|
@@ -720,7 +716,9 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
|
|
| 720 |
else:
|
| 721 |
batch_size = prompt_embeds.shape[0]
|
| 722 |
|
| 723 |
-
|
|
|
|
|
|
|
| 724 |
# 3. Preprocess image
|
| 725 |
|
| 726 |
prompt_images = [image.resize((round(image.width * 28 / 32), round(image.height * 28 / 32))) for image in images]
|
|
@@ -907,4 +905,4 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
|
|
| 907 |
if not return_dict:
|
| 908 |
return (image,)
|
| 909 |
|
| 910 |
-
return QwenImagePipelineOutput(images=image)
|
|
|
|
| 48 |
>>> from PIL import Image
|
| 49 |
>>> from diffusers import QwenImageEditPipeline
|
| 50 |
>>> from diffusers.utils import load_image
|
|
|
|
| 51 |
>>> pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16)
|
| 52 |
>>> pipe.to("cuda")
|
| 53 |
>>> image = load_image(
|
|
|
|
| 90 |
r"""
|
| 91 |
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
| 92 |
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
|
|
|
| 93 |
Args:
|
| 94 |
scheduler (`SchedulerMixin`):
|
| 95 |
The scheduler to get timesteps from.
|
|
|
|
| 104 |
sigmas (`List[float]`, *optional*):
|
| 105 |
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
|
| 106 |
`num_inference_steps` and `timesteps` must be `None`.
|
|
|
|
| 107 |
Returns:
|
| 108 |
`Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
|
| 109 |
second element is the number of inference steps.
|
|
|
|
| 174 |
class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
|
| 175 |
r"""
|
| 176 |
The Qwen-Image-Edit pipeline for image editing.
|
|
|
|
| 177 |
Args:
|
| 178 |
transformer ([`QwenImageTransformer2DModel`]):
|
| 179 |
Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
|
|
|
|
| 240 |
device: Optional[torch.device] = None,
|
| 241 |
dtype: Optional[torch.dtype] = None,
|
| 242 |
):
|
| 243 |
+
# Fix: prefer the text_encoder's actual device over self.device
|
| 244 |
+
# This ensures device consistency in the ZeroGPU environment
|
| 245 |
+
if device is None:
|
| 246 |
+
device = next(self.text_encoder.parameters()).device
|
| 247 |
dtype = dtype or self.text_encoder.dtype
|
| 248 |
|
| 249 |
prompts = [prompts] if isinstance(prompts, str) else prompts
|
|
|
|
| 276 |
)
|
| 277 |
texts.append(text)
|
| 278 |
|
| 279 |
+
# Process inputs - fix: use the text_encoder's actual device
|
| 280 |
model_inputs = self.processor(
|
| 281 |
text=texts,
|
| 282 |
images=images,
|
| 283 |
do_resize=False, # already resized
|
| 284 |
padding=True,
|
| 285 |
return_tensors="pt"
|
| 286 |
+
).to(device)
|
| 287 |
|
| 288 |
# template = self.prompt_template_encode
|
| 289 |
drop_idx = self.prompt_template_encode_start_idx
|
|
|
|
| 332 |
max_sequence_length: int = 1024,
|
| 333 |
):
|
| 334 |
r"""
|
|
|
|
| 335 |
Args:
|
| 336 |
prompt (`str` or `List[str]`, *optional*):
|
| 337 |
prompt to be encoded
|
|
|
|
| 345 |
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
| 346 |
provided, text embeddings will be generated from `prompt` input argument.
|
| 347 |
"""
|
| 348 |
+
# Fix: prefer the text_encoder's actual device
|
| 349 |
+
if device is None:
|
| 350 |
+
device = next(self.text_encoder.parameters()).device
|
| 351 |
|
| 352 |
prompt = [prompt] if isinstance(prompt, str) else prompt
|
| 353 |
batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0]
|
|
|
|
| 601 |
):
|
| 602 |
r"""
|
| 603 |
Function invoked when calling the pipeline for generation.
|
|
|
|
| 604 |
Args:
|
| 605 |
prompt (`str` or `List[str]`, *optional*):
|
| 606 |
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
|
|
|
|
| 628 |
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
| 629 |
`guidance_scale > 1`. A higher guidance scale encourages the model to generate images that are closely linked to
|
| 630 |
the text `prompt`, usually at the expense of lower image quality.
|
|
|
|
| 631 |
This parameter in the pipeline is there to support future guidance-distilled models when they come up.
|
| 632 |
Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance,
|
| 633 |
please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should
|
|
|
|
| 667 |
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
| 668 |
`._callback_tensor_inputs` attribute of your pipeline class.
|
| 669 |
max_sequence_length (`int`, defaults to 512): Maximum sequence length to use with the `prompt`.
|
|
|
|
| 670 |
Examples:
|
|
|
|
| 671 |
Returns:
|
| 672 |
[`~pipelines.qwenimage.QwenImagePipelineOutput`] or `tuple`:
|
| 673 |
[`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
|
|
|
|
| 716 |
else:
|
| 717 |
batch_size = prompt_embeds.shape[0]
|
| 718 |
|
| 719 |
+
# Fix: use the text_encoder's actual device instead of _execution_device
|
| 720 |
+
device = next(self.text_encoder.parameters()).device
|
| 721 |
+
|
| 722 |
# 3. Preprocess image
|
| 723 |
|
| 724 |
prompt_images = [image.resize((round(image.width * 28 / 32), round(image.height * 28 / 32))) for image in images]
|
|
|
|
| 905 |
if not return_dict:
|
| 906 |
return (image,)
|
| 907 |
|
| 908 |
+
return QwenImagePipelineOutput(images=image)
|