Buckets:

rtrm's picture
download
raw
26.1 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Introduction to Stable Diffusion&quot;,&quot;local&quot;:&quot;introduction-to-stable-diffusion&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;What Do We Need for Stable Diffusion to Work?&quot;,&quot;local&quot;:&quot;what-do-we-need-for-stable-diffusion-to-work&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;How to use text-to-image , image-to-image , Inpainting Models in Diffusers&quot;,&quot;local&quot;:&quot;how-to-use-text-to-image--image-to-image--inpainting-models-in-diffusers&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Further Reading&quot;,&quot;local&quot;:&quot;further-reading&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/computer-vision-course/pr_397/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/entry/start.7f209408.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/scheduler.7bc62968.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/singletons.b15acae1.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/paths.11cdc4b4.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/entry/app.32e8338e.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/index.2f8492b0.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/nodes/0.e37092e8.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/nodes/64.394cfb13.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/CodeBlock.bb61a5a9.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/index.514d62da.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Introduction to Stable Diffusion&quot;,&quot;local&quot;:&quot;introduction-to-stable-diffusion&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;What Do We Need for Stable Diffusion to Work?&quot;,&quot;local&quot;:&quot;what-do-we-need-for-stable-diffusion-to-work&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;How to use text-to-image , image-to-image , Inpainting Models in Diffusers&quot;,&quot;local&quot;:&quot;how-to-use-text-to-image--image-to-image--inpainting-models-in-diffusers&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Further Reading&quot;,&quot;local&quot;:&quot;further-reading&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="introduction-to-stable-diffusion" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#introduction-to-stable-diffusion"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>Introduction to Stable Diffusion</span></h1> <p data-svelte-h="svelte-z66int">This chapter introduces the building blocks of Stable Diffusion which is a generative artificial intelligence (generative AI) model that produces unique photorealistic images from text and image prompts. It originally launched in 2022 and was made possible thanks to a collaboration with
<a href="https://stability.ai/" rel="nofollow">Stability AI</a>, <a href="https://runwayml.com/" rel="nofollow">RunwayML</a> and CompVis Group at LMU Munich following the <a href="https://arxiv.org/pdf/2112.10752.pdf" rel="nofollow">paper</a>.</p> <p data-svelte-h="svelte-1sm6n8h">What will you learn from this chapter?</p> <ul data-svelte-h="svelte-15b6ktr"><li>Fundamental components of Stable Diffusion</li> <li>How to use <code>text-to-image</code>, <code>image2image</code>, inpainting pipelines</li></ul> <h2 class="relative group"><a id="what-do-we-need-for-stable-diffusion-to-work" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-do-we-need-for-stable-diffusion-to-work"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What Do We Need for Stable Diffusion to Work?</span></h2> <p data-svelte-h="svelte-15v5s0q">To make this section interesting we will try to answer some questions to understand the basic components of the Stable Diffusion process.
We will briefly discuss each component as they are already covered in our Diffusers course. Also, you can visit our previous section, which talks about GANs and Diffusion models in detail.</p> <ul data-svelte-h="svelte-cbr84n"><li>What strategies does Stable Diffusion employ to learn new information?<ul><li>It uses forward and reverse processes of diffusion models. In the forward process, we add Gaussian noise to an image until all that remains is the random noise. Usually we cannot identify the final noisy version of the image.</li> <li>In the reverse process, we have a learned neural network trained to gradually denoise an image starting from pure noise, until you end up with an actual image.</li></ul></li></ul> <p>Both of these processes happen for a finite number of steps <code data-svelte-h="svelte-18tc35m">T</code> (as per the DDPM paper, T=1000). You begin the process at time <!-- HTML_TAG_START --><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>t</mi><mn>0</mn></msub></mrow><annotation encoding="application/x-tex">t_0</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.7651em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathnormal">t</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3011em;"><span style="top:-2.55em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">0</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span><!-- HTML_TAG_END --> by sampling a real image from your data distribution, and the forward process samples some noise from a Gaussian distribution at 
each time step t, which is added to the image of the previous time step. To get more mathematical intuition, please read the <a href="https://huggingface.co/blog/annotated-diffusion" rel="nofollow" data-svelte-h="svelte-1h13rvl">Hugging Face Blog</a> post on Diffusion Models.</p> <ul data-svelte-h="svelte-nhvgmx"><li>Since our images can be huge, how can we compress them?</li></ul> <p data-svelte-h="svelte-ghqky8">When you have large images, they require more computing power to process. This becomes very noticeable in a specific operation known as self-attention. The bigger the image, the more calculations are needed, and these calculations increase very quickly (in a way mathematicians call “quadratically”) with the size of the image.
For example, if you have an image that’s 128 pixels wide and tall, it has four times more pixels than an image that’s only 64 pixels wide and tall. Because of how self-attention works, dealing with this larger image doesn’t just need four times more memory and computing power, it actually needs sixteen times more (since 4 times 4 equals 16). This makes it challenging to work with very high-resolution images, as they require a lot of resources to process.
Latent diffusion models address the high computational demands of processing large images by using a Variational Auto-Encoder (VAE) to shrink the images into a more manageable size. The idea is that many images have repetitive or unnecessary information. A VAE, after being trained on a lot of data, can compress an image into a much smaller, condensed form. This smaller version still retains the essential features of the original image.</p> <ul data-svelte-h="svelte-wgoof5"><li>How are we fusing texts with images since we are using prompts?</li></ul> <p data-svelte-h="svelte-1xqaret">We know that during inference time, we can feed in the description of an image we’d like to see and some pure noise as a starting point, and the model does its best to ‘denoise’ the random input into something that matches the caption.
SD leverages a pre-trained transformer model based on something called <a href="https://huggingface.co/learn/computer-vision-course/unit4/multimodal-models/clip-and-relatives/clip" rel="nofollow">CLIP</a>. CLIP’s text encoder was designed to process image captions into a form that could be used to compare images and text, so it is well suited to the task of creating useful representations from image descriptions. An input prompt is first tokenized (based on a large vocabulary where each word or sub-word is assigned a specific token) and then fed through the CLIP text encoder, producing a 768-dimensional (in the case of SD 1.X) or 1024-dimensional (SD 2.X) vector for each token. To keep things consistent, prompts are always padded/truncated to be 77 tokens long, and so the final representation which we use as conditioning is a tensor of shape 77x768 (SD 1.X) or 77x1024 (SD 2.X) per prompt.</p> <ul data-svelte-h="svelte-1gci2og"><li>How can we add in good inductive biases?</li></ul> <p data-svelte-h="svelte-ceb1g8">Since we are trying to generate something new (e.g., a realistic Pokemon), we need a way to go beyond the images we have seen before (e.g., an anime Pokemon). That’s where U-Net and self-attention come into the picture. Given a noisy version of an image, the model is tasked with predicting the denoised version based on additional clues such as a text description of the image. Ok, how do we actually feed this conditioning information into the U-Net for it to use as it makes predictions? The answer is something called cross-attention. Scattered throughout the U-Net are cross-attention layers.
Each spatial location in the U-Net can ‘attend’ to different tokens in the text conditioning, bringing in relevant information from the prompt.</p> <h2 class="relative group"><a id="how-to-use-text-to-image--image-to-image--inpainting-models-in-diffusers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-to-use-text-to-image--image-to-image--inpainting-models-in-diffusers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How to use text-to-image , image-to-image , Inpainting Models in Diffusers</span></h2> <p data-svelte-h="svelte-1tbjqci">This section introduces helpful usecases and how we can perform these tasks using the <a href="https://github.com/huggingface/diffusers" rel="nofollow">Diffusers</a> library.</p> <ul data-svelte-h="svelte-4vkcyc"><li>Steps for <code>text-to-image</code> inference
The idea is to pass in the text prompt, which is converted to the output image.</li></ul> <iframe src="https://hysts-controlnet-v1-1.hf.space/" frameborder="0" width="850" height="450" data-svelte-h="svelte-1du1c08"></iframe> <p data-svelte-h="svelte-1xlt128">Using the <code>diffusers</code> library you can get <code>text-to-image</code> working in 2 steps.</p> <p data-svelte-h="svelte-3yyytc">Let’s install the <code>diffusers</code> library first.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install diffusers<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-18bx7u2">We will now initialize the pipeline and pass our prompt inside and infer.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer 
focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AutoPipelineForText2Image
<span class="hljs-keyword">import</span> torch
pipeline = AutoPipelineForText2Image.from_pretrained(
<span class="hljs-string">&quot;runwayml/stable-diffusion-v1-5&quot;</span>, torch_dtype=torch.float16, variant=<span class="hljs-string">&quot;fp16&quot;</span>
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
generator = torch.Generator(device=<span class="hljs-string">&quot;cuda&quot;</span>).manual_seed(<span class="hljs-number">31</span>)
image = pipeline(
<span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>,
generator=generator,
).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <ul data-svelte-h="svelte-k07d6l"><li>Steps for image-to-image inference
In similar fashion, we can initialize the pipeline, but pass an image and a text prompt instead.</li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AutoPipelineForImage2Image
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image, make_image_grid
pipeline = AutoPipelineForImage2Image.from_pretrained(
<span class="hljs-string">&quot;kandinsky-community/kandinsky-2-2-decoder&quot;</span>,
torch_dtype=torch.float16,
use_safetensors=<span class="hljs-literal">True</span>,
)
pipeline.enable_model_cpu_offload()
<span class="hljs-comment"># remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed</span>
pipeline.enable_xformers_memory_efficient_attention()
<span class="hljs-comment"># Load an image to pass to the pipeline:</span>
init_image = load_image(
<span class="hljs-string">&quot;https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png&quot;</span>
)
<span class="hljs-comment"># Pass a prompt and image to the pipeline to generate an image:</span>
prompt = <span class="hljs-string">&quot;cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k&quot;</span>
image = pipeline(prompt, image=init_image).images[<span class="hljs-number">0</span>]
make_image_grid([init_image, image], rows=<span class="hljs-number">1</span>, cols=<span class="hljs-number">2</span>)<!-- HTML_TAG_END --></pre></div> <ul data-svelte-h="svelte-3vw8pj"><li>Steps for Inpainting
For the inpainting pipeline, we need to pass an image, a text prompt, and a mask based on an object in that image, which indicates what to inpaint in the image.
In this example we also pass a negative prompt to further influence the inference on what we want to avoid.</li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Load the pipeline</span>
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AutoPipelineForInpainting
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image, make_image_grid
pipeline = AutoPipelineForInpainting.from_pretrained(
<span class="hljs-string">&quot;kandinsky-community/kandinsky-2-2-decoder-inpaint&quot;</span>, torch_dtype=torch.float16
)
pipeline.enable_model_cpu_offload()
<span class="hljs-comment"># remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed</span>
pipeline.enable_xformers_memory_efficient_attention()
<span class="hljs-comment"># Load the base and mask images:</span>
init_image = load_image(
<span class="hljs-string">&quot;https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png&quot;</span>
)
mask_image = load_image(
<span class="hljs-string">&quot;https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png&quot;</span>
)
<span class="hljs-comment"># Create a prompt to inpaint the image with and pass it to the pipeline with the base and mask images:</span>
prompt = (
<span class="hljs-string">&quot;a black cat with glowing eyes, cute, adorable, disney, pixar, highly detailed, 8k&quot;</span>
)
negative_prompt = <span class="hljs-string">&quot;bad anatomy, deformed, ugly, disfigured&quot;</span>
image = pipeline(
prompt=prompt,
negative_prompt=negative_prompt,
image=init_image,
mask_image=mask_image,
).images[<span class="hljs-number">0</span>]
make_image_grid([init_image, mask_image, image], rows=<span class="hljs-number">1</span>, cols=<span class="hljs-number">3</span>)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="further-reading" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#further-reading"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Further Reading</span></h3> <ul data-svelte-h="svelte-cvv663"><li><a href="https://huggingface.co/docs/diffusers/using-diffusers/pipeline_overview" rel="nofollow">Diffusers documentation</a></li> <li><a href="https://huggingface.co/docs/diffusers/installation" rel="nofollow">Diffusers installation</a></li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/computer-vision-course/blob/main/chapters/en/unit5/generative-models/diffusion-models/stable-diffusion.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
	{
		// Assigned without a declaration keyword on purpose: this creates a
		// global binding rather than a block-scoped one.
		__sveltekit_1p6gie1 = {
			assets: "/docs/computer-vision-course/pr_397/en",
			base: "/docs/computer-vision-course/pr_397/en",
			env: {}
		};

		// document.currentScript must be read synchronously, while this
		// script is still executing; the parent of the <script> tag is the
		// element handed to kit.start below.
		const mountTarget = document.currentScript.parentElement;

		// One entry per id in node_ids.
		const nodeData = [null, null];

		const startOptions = {
			node_ids: [0, 64],
			data: nodeData,
			form: null,
			error: null
		};

		// Load both entry modules in parallel, then start the app once both
		// have resolved.
		const entryModules = [
			import("/docs/computer-vision-course/pr_397/en/_app/immutable/entry/start.7f209408.js"),
			import("/docs/computer-vision-course/pr_397/en/_app/immutable/entry/app.32e8338e.js")
		];

		Promise.all(entryModules).then((loaded) => {
			const kit = loaded[0];
			const app = loaded[1];
			kit.start(app, mountTarget, startOptions);
		});
	}
</script>

Xet Storage Details

Size:
26.1 kB
·
Xet hash:
964d3baa587785c69bfa896e604df78a2a11560d0ca8920e89bd537100a0d7ad

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.