Replaced init_image with image to match git change.
Browse files- pipeline.py +26 -23
pipeline.py
CHANGED
|
@@ -579,7 +579,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 579 |
self,
|
| 580 |
prompt: Union[str, List[str]],
|
| 581 |
negative_prompt: Optional[Union[str, List[str]]] = None,
|
| 582 |
-
init_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
|
| 583 |
mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
|
| 584 |
height: int = 512,
|
| 585 |
width: int = 512,
|
|
@@ -607,11 +607,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 607 |
negative_prompt (`str` or `List[str]`, *optional*):
|
| 608 |
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
| 609 |
if `guidance_scale` is less than `1`).
|
| 610 |
-
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
| 611 |
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
| 612 |
process.
|
| 613 |
mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
| 614 |
-
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
|
| 615 |
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
|
| 616 |
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
|
| 617 |
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
|
|
@@ -629,11 +629,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 629 |
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
| 630 |
usually at the expense of lower image quality.
|
| 631 |
strength (`float`, *optional*, defaults to 0.8):
|
| 632 |
-
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
|
| 633 |
-
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
| 634 |
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
| 635 |
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
| 636 |
-
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
|
| 637 |
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
| 638 |
The number of images to generate per prompt.
|
| 639 |
eta (`float`, *optional*, defaults to 0.0):
|
|
@@ -672,6 +672,9 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 672 |
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
| 673 |
(nsfw) content, according to the `safety_checker`.
|
| 674 |
"""
|
|
|
|
|
|
|
|
|
|
| 675 |
|
| 676 |
if isinstance(prompt, str):
|
| 677 |
batch_size = 1
|
|
@@ -738,7 +741,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 738 |
mask = None
|
| 739 |
noise = None
|
| 740 |
|
| 741 |
-
if init_image is None:
|
| 742 |
# get the initial random noise unless the user supplied it
|
| 743 |
|
| 744 |
# Unlike in other pipelines, latents need to be generated in the target device
|
|
@@ -777,11 +780,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 777 |
# scale the initial noise by the standard deviation required by the scheduler
|
| 778 |
latents = latents * self.scheduler.init_noise_sigma
|
| 779 |
else:
|
| 780 |
-
if isinstance(init_image, PIL.Image.Image):
|
| 781 |
-
init_image = preprocess_image(init_image)
|
| 782 |
# encode the init image into latents and scale the latents
|
| 783 |
-
init_image = init_image.to(device=self.device, dtype=latents_dtype)
|
| 784 |
-
init_latent_dist = self.vae.encode(init_image).latent_dist
|
| 785 |
init_latents = init_latent_dist.sample(generator=generator)
|
| 786 |
init_latents = 0.18215 * init_latents
|
| 787 |
init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
|
|
@@ -796,7 +799,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 796 |
|
| 797 |
# check sizes
|
| 798 |
if not mask.shape == init_latents.shape:
|
| 799 |
-
raise ValueError("The mask and init_image should be the same size!")
|
| 800 |
|
| 801 |
# get the original timestep using init_timestep
|
| 802 |
offset = self.scheduler.config.get("steps_offset", 0)
|
|
@@ -985,7 +988,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 985 |
|
| 986 |
def img2img(
|
| 987 |
self,
|
| 988 |
-
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
| 989 |
prompt: Union[str, List[str]],
|
| 990 |
negative_prompt: Optional[Union[str, List[str]]] = None,
|
| 991 |
strength: float = 0.8,
|
|
@@ -1004,7 +1007,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 1004 |
r"""
|
| 1005 |
Function for image-to-image generation.
|
| 1006 |
Args:
|
| 1007 |
-
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
| 1008 |
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
| 1009 |
process.
|
| 1010 |
prompt (`str` or `List[str]`):
|
|
@@ -1013,11 +1016,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 1013 |
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
| 1014 |
if `guidance_scale` is less than `1`).
|
| 1015 |
strength (`float`, *optional*, defaults to 0.8):
|
| 1016 |
-
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
|
| 1017 |
-
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
| 1018 |
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
| 1019 |
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
| 1020 |
-
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
|
| 1021 |
num_inference_steps (`int`, *optional*, defaults to 50):
|
| 1022 |
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
| 1023 |
expense of slower inference. This parameter will be modulated by `strength`.
|
|
@@ -1059,7 +1062,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 1059 |
return self.__call__(
|
| 1060 |
prompt=prompt,
|
| 1061 |
negative_prompt=negative_prompt,
|
| 1062 |
-
init_image=init_image,
|
| 1063 |
num_inference_steps=num_inference_steps,
|
| 1064 |
guidance_scale=guidance_scale,
|
| 1065 |
strength=strength,
|
|
@@ -1076,7 +1079,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 1076 |
|
| 1077 |
def inpaint(
|
| 1078 |
self,
|
| 1079 |
-
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
| 1080 |
mask_image: Union[torch.FloatTensor, PIL.Image.Image],
|
| 1081 |
prompt: Union[str, List[str]],
|
| 1082 |
negative_prompt: Optional[Union[str, List[str]]] = None,
|
|
@@ -1096,11 +1099,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 1096 |
r"""
|
| 1097 |
Function for inpaint.
|
| 1098 |
Args:
|
| 1099 |
-
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
| 1100 |
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
| 1101 |
process. This is the image whose masked region will be inpainted.
|
| 1102 |
mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
| 1103 |
-
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
|
| 1104 |
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
|
| 1105 |
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
|
| 1106 |
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
|
|
@@ -1112,7 +1115,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 1112 |
strength (`float`, *optional*, defaults to 0.8):
|
| 1113 |
Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
|
| 1114 |
is 1, the denoising process will be run on the masked area for the full number of iterations specified
|
| 1115 |
-
in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
|
| 1116 |
noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
|
| 1117 |
num_inference_steps (`int`, *optional*, defaults to 50):
|
| 1118 |
The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
|
|
@@ -1155,7 +1158,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
|
| 1155 |
return self.__call__(
|
| 1156 |
prompt=prompt,
|
| 1157 |
negative_prompt=negative_prompt,
|
| 1158 |
-
init_image=init_image,
|
| 1159 |
mask_image=mask_image,
|
| 1160 |
num_inference_steps=num_inference_steps,
|
| 1161 |
guidance_scale=guidance_scale,
|
|
|
|
| 579 |
self,
|
| 580 |
prompt: Union[str, List[str]],
|
| 581 |
negative_prompt: Optional[Union[str, List[str]]] = None,
|
| 582 |
+
image: Union[torch.FloatTensor, PIL.Image.Image] = None,
|
| 583 |
mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
|
| 584 |
height: int = 512,
|
| 585 |
width: int = 512,
|
|
|
|
| 607 |
negative_prompt (`str` or `List[str]`, *optional*):
|
| 608 |
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
| 609 |
if `guidance_scale` is less than `1`).
|
| 610 |
+
image (`torch.FloatTensor` or `PIL.Image.Image`):
|
| 611 |
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
| 612 |
process.
|
| 613 |
mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
| 614 |
+
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
|
| 615 |
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
|
| 616 |
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
|
| 617 |
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
|
|
|
|
| 629 |
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
| 630 |
usually at the expense of lower image quality.
|
| 631 |
strength (`float`, *optional*, defaults to 0.8):
|
| 632 |
+
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
|
| 633 |
+
`image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
| 634 |
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
| 635 |
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
| 636 |
+
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
|
| 637 |
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
| 638 |
The number of images to generate per prompt.
|
| 639 |
eta (`float`, *optional*, defaults to 0.0):
|
|
|
|
| 672 |
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
| 673 |
(nsfw) content, according to the `safety_checker`.
|
| 674 |
"""
|
| 675 |
+
message = "Please use `image` instead of `init_image`."
|
| 676 |
+
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
|
| 677 |
+
image = init_image or image
|
| 678 |
|
| 679 |
if isinstance(prompt, str):
|
| 680 |
batch_size = 1
|
|
|
|
| 741 |
mask = None
|
| 742 |
noise = None
|
| 743 |
|
| 744 |
+
if image is None:
|
| 745 |
# get the initial random noise unless the user supplied it
|
| 746 |
|
| 747 |
# Unlike in other pipelines, latents need to be generated in the target device
|
|
|
|
| 780 |
# scale the initial noise by the standard deviation required by the scheduler
|
| 781 |
latents = latents * self.scheduler.init_noise_sigma
|
| 782 |
else:
|
| 783 |
+
if isinstance(image, PIL.Image.Image):
|
| 784 |
+
image = preprocess_image(image)
|
| 785 |
# encode the init image into latents and scale the latents
|
| 786 |
+
image = image.to(device=self.device, dtype=latents_dtype)
|
| 787 |
+
init_latent_dist = self.vae.encode(image).latent_dist
|
| 788 |
init_latents = init_latent_dist.sample(generator=generator)
|
| 789 |
init_latents = 0.18215 * init_latents
|
| 790 |
init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
|
|
|
|
| 799 |
|
| 800 |
# check sizes
|
| 801 |
if not mask.shape == init_latents.shape:
|
| 802 |
+
raise ValueError("The mask and image should be the same size!")
|
| 803 |
|
| 804 |
# get the original timestep using init_timestep
|
| 805 |
offset = self.scheduler.config.get("steps_offset", 0)
|
|
|
|
| 988 |
|
| 989 |
def img2img(
|
| 990 |
self,
|
| 991 |
+
image: Union[torch.FloatTensor, PIL.Image.Image],
|
| 992 |
prompt: Union[str, List[str]],
|
| 993 |
negative_prompt: Optional[Union[str, List[str]]] = None,
|
| 994 |
strength: float = 0.8,
|
|
|
|
| 1007 |
r"""
|
| 1008 |
Function for image-to-image generation.
|
| 1009 |
Args:
|
| 1010 |
+
image (`torch.FloatTensor` or `PIL.Image.Image`):
|
| 1011 |
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
| 1012 |
process.
|
| 1013 |
prompt (`str` or `List[str]`):
|
|
|
|
| 1016 |
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
| 1017 |
if `guidance_scale` is less than `1`).
|
| 1018 |
strength (`float`, *optional*, defaults to 0.8):
|
| 1019 |
+
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
|
| 1020 |
+
`image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
| 1021 |
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
| 1022 |
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
| 1023 |
+
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
|
| 1024 |
num_inference_steps (`int`, *optional*, defaults to 50):
|
| 1025 |
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
| 1026 |
expense of slower inference. This parameter will be modulated by `strength`.
|
|
|
|
| 1062 |
return self.__call__(
|
| 1063 |
prompt=prompt,
|
| 1064 |
negative_prompt=negative_prompt,
|
| 1065 |
+
image=image,
|
| 1066 |
num_inference_steps=num_inference_steps,
|
| 1067 |
guidance_scale=guidance_scale,
|
| 1068 |
strength=strength,
|
|
|
|
| 1079 |
|
| 1080 |
def inpaint(
|
| 1081 |
self,
|
| 1082 |
+
image: Union[torch.FloatTensor, PIL.Image.Image],
|
| 1083 |
mask_image: Union[torch.FloatTensor, PIL.Image.Image],
|
| 1084 |
prompt: Union[str, List[str]],
|
| 1085 |
negative_prompt: Optional[Union[str, List[str]]] = None,
|
|
|
|
| 1099 |
r"""
|
| 1100 |
Function for inpaint.
|
| 1101 |
Args:
|
| 1102 |
+
image (`torch.FloatTensor` or `PIL.Image.Image`):
|
| 1103 |
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
| 1104 |
process. This is the image whose masked region will be inpainted.
|
| 1105 |
mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
| 1106 |
+
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
|
| 1107 |
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
|
| 1108 |
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
|
| 1109 |
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
|
|
|
|
| 1115 |
strength (`float`, *optional*, defaults to 0.8):
|
| 1116 |
Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
|
| 1117 |
is 1, the denoising process will be run on the masked area for the full number of iterations specified
|
| 1118 |
+
in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
|
| 1119 |
noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
|
| 1120 |
num_inference_steps (`int`, *optional*, defaults to 50):
|
| 1121 |
The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
|
|
|
|
| 1158 |
return self.__call__(
|
| 1159 |
prompt=prompt,
|
| 1160 |
negative_prompt=negative_prompt,
|
| 1161 |
+
image=image,
|
| 1162 |
mask_image=mask_image,
|
| 1163 |
num_inference_steps=num_inference_steps,
|
| 1164 |
guidance_scale=guidance_scale,
|