Buckets:

hf-doc-build
/

doc

Files

xet

hf-doc-build/doc / diffusers /v0.9.0 /en /optimization /fp16.html

rtrm

27 days ago

download

raw

49.6 kB

	<meta charset="utf-8" /><meta http-equiv="content-security-policy" content=""><meta name="hf:doc:metadata" content="{"local":"memory-and-speed","sections":[{"local":"enable-cudnn-autotuner","sections":[{"local":"use-tf32-instead-of-fp32-on-ampere-and-later-cuda-devices","title":"Use tf32 instead of fp32 (on Ampere and later CUDA devices)"}],"title":"Enable cuDNN auto-tuner"},{"local":"automatic-mixed-precision-amp","title":"Automatic mixed precision (AMP)"},{"local":"half-precision-weights","title":"Half precision weights"},{"local":"sliced-attention-for-additional-memory-savings","title":"Sliced attention for additional memory savings"},{"local":"sliced-vae-decode-for-larger-batches","title":"Sliced VAE decode for larger batches"},{"local":"offloading-to-cpu-with-accelerate-for-memory-savings","title":"Offloading to CPU with accelerate for memory savings"},{"local":"using-channels-last-memory-format","title":"Using Channels Last memory format"},{"local":"tracing","title":"Tracing"},{"local":"memory-efficient-attention","title":"Memory Efficient Attention"}],"title":"Memory and speed"}" data-svelte="svelte-1phssyn">
	<link rel="modulepreload" href="/docs/diffusers/v0.9.0/en/_app/assets/pages/__layout.svelte-hf-doc-builder.css">
	<link rel="modulepreload" href="/docs/diffusers/v0.9.0/en/_app/start-hf-doc-builder.js">
	<link rel="modulepreload" href="/docs/diffusers/v0.9.0/en/_app/chunks/vendor-hf-doc-builder.js">
	<link rel="modulepreload" href="/docs/diffusers/v0.9.0/en/_app/chunks/paths-hf-doc-builder.js">
	<link rel="modulepreload" href="/docs/diffusers/v0.9.0/en/_app/pages/__layout.svelte-hf-doc-builder.js">
	<link rel="modulepreload" href="/docs/diffusers/v0.9.0/en/_app/pages/optimization/fp16.mdx-hf-doc-builder.js">
	<link rel="modulepreload" href="/docs/diffusers/v0.9.0/en/_app/chunks/Tip-hf-doc-builder.js">
	<link rel="modulepreload" href="/docs/diffusers/v0.9.0/en/_app/chunks/IconCopyLink-hf-doc-builder.js">
	<link rel="modulepreload" href="/docs/diffusers/v0.9.0/en/_app/chunks/CodeBlock-hf-doc-builder.js">






	<h1 class="relative group"><a id="memory-and-speed" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memory-and-speed"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Memory and speed
	</span></h1>

	<p>We present some techniques and ideas to optimize 🤗 Diffusers <em>inference</em> for memory or speed.</p>
	<table><thead><tr><th></th>
	<th>Latency</th>
	<th>Speedup</th></tr></thead>
	<tbody><tr><td>original</td>
	<td>9.50s</td>
	<td>x1</td></tr>
	<tr><td>cuDNN auto-tuner</td>
	<td>9.37s</td>
	<td>x1.01</td></tr>
	<tr><td>autocast (fp16)</td>
	<td>5.47s</td>
	<td>x1.74</td></tr>
	<tr><td>fp16</td>
	<td>3.61s</td>
	<td>x2.63</td></tr>
	<tr><td>channels last</td>
	<td>3.30s</td>
	<td>x2.88</td></tr>
	<tr><td>traced UNet</td>
	<td>3.21s</td>
	<td>x2.96</td></tr>
	<tr><td>memory efficient attention</td>
	<td>2.63s</td>
	<td>x3.61</td></tr></tbody></table>
	<em>obtained on NVIDIA TITAN RTX by generating a single image of size 512x512 from
	the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM
	steps.
	</em>
	<h2 class="relative group"><a id="enable-cudnn-autotuner" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#enable-cudnn-autotuner"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Enable cuDNN auto-tuner
	</span></h2>

	<p><a href="https://developer.nvidia.com/cudnn" rel="nofollow">NVIDIA cuDNN</a> supports many algorithms to compute a convolution. Autotuner runs a short benchmark and selects the kernel with the best performance on a given hardware for a given input size.</p>
	<p>Since we’re using <strong>convolutional networks</strong> (other types currently not supported), we can enable cuDNN autotuner before launching the inference by setting:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch

	torch.backends.cudnn.benchmark = <span class="hljs-literal">True</span><!-- HTML_TAG_END --></pre></div>
	<h3 class="relative group"><a id="use-tf32-instead-of-fp32-on-ampere-and-later-cuda-devices" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#use-tf32-instead-of-fp32-on-ampere-and-later-cuda-devices"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Use tf32 instead of fp32 (on Ampere and later CUDA devices)
	</span></h3>

	<p>On Ampere and later CUDA devices matrix multiplications and convolutions can use the TensorFloat32 (TF32) mode for faster but slightly less accurate computations. By default PyTorch enables TF32 mode for convolutions but not matrix multiplications, and unless a network requires full float32 precision we recommend enabling this setting for matrix multiplications, too. It can significantly speed up computations with typically negligible loss of numerical accuracy. You can read more about it <a href="https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32" rel="nofollow">here</a>. All you need to do is to add this before your inference:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch

	torch.backends.cuda.matmul.allow_tf32 = <span class="hljs-literal">True</span><!-- HTML_TAG_END --></pre></div>
	<h2 class="relative group"><a id="automatic-mixed-precision-amp" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#automatic-mixed-precision-amp"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Automatic mixed precision (AMP)
	</span></h2>

	<p>If you use a CUDA GPU, you can take advantage of <code>torch.autocast</code> to perform inference roughly twice as fast at the cost of slightly lower precision. All you need to do is put your inference call inside an <code>autocast</code> context manager. The following example shows how to do it using Stable Diffusion text-to-image generation as an example:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> torch <span class="hljs-keyword">import</span> autocast
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline

	pipe = StableDiffusionPipeline.from_pretrained(<span class="hljs-string">"runwayml/stable-diffusion-v1-5"</span>)
	pipe = pipe.to(<span class="hljs-string">"cuda"</span>)

	prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
	<span class="hljs-keyword">with</span> autocast(<span class="hljs-string">"cuda"</span>):
	image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div>
	<p>Despite the precision loss, in our experience the final image results look the same as the <code>float32</code> versions. Feel free to experiment and report back!</p>
	<h2 class="relative group"><a id="half-precision-weights" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#half-precision-weights"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Half precision weights
	</span></h2>

	<p>To save more GPU memory and get even more speed, you can load and run the model weights directly in half precision. This involves loading the float16 version of the weights, which was saved to a branch named <code>fp16</code>, and telling PyTorch to use the <code>float16</code> type when loading them:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START -->pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"runwayml/stable-diffusion-v1-5"</span>,
	revision=<span class="hljs-string">"fp16"</span>,
	torch_dtype=torch.float16,
	)
	pipe = pipe.to(<span class="hljs-string">"cuda"</span>)

	prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
	image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div>
	<h2 class="relative group"><a id="sliced-attention-for-additional-memory-savings" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sliced-attention-for-additional-memory-savings"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Sliced attention for additional memory savings
	</span></h2>

	<p>For even additional memory savings, you can use a sliced version of attention that performs the computation in steps instead of all at once.</p>


	<div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">Attention slicing is useful even if a batch size of just 1 is used - as long
	as the model uses more than one attention head. If there is more than one
	attention head the QK^T attention matrix can be computed sequentially for
	each head which can save a significant amount of memory.
	</div>
	<p>To perform the attention computation sequentially over each head, you only need to invoke <a href="/docs/diffusers/v0.9.0/en/api/pipelines/stable_diffusion#diffusers.StableDiffusionPipeline.enable_attention_slicing">enable_attention_slicing()</a> in your pipeline before inference, like here:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline

	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"runwayml/stable-diffusion-v1-5"</span>,
	revision=<span class="hljs-string">"fp16"</span>,
	torch_dtype=torch.float16,
	)
	pipe = pipe.to(<span class="hljs-string">"cuda"</span>)

	prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
	pipe.enable_attention_slicing()
	image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div>
	<p>There’s a small performance penalty of about 10% slower inference times, but this method allows you to use Stable Diffusion in as little as 3.2 GB of VRAM!</p>
	<h2 class="relative group"><a id="sliced-vae-decode-for-larger-batches" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sliced-vae-decode-for-larger-batches"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Sliced VAE decode for larger batches
	</span></h2>

	<p>To decode large batches of images with limited VRAM, or to enable batches with 32 images or more, you can use sliced VAE decode that decodes the batch latents one image at a time.</p>
	<p>You likely want to couple this with <a href="/docs/diffusers/v0.9.0/en/api/pipelines/stable_diffusion#diffusers.StableDiffusionPipeline.enable_attention_slicing">enable_attention_slicing()</a> or <a href="/docs/diffusers/v0.9.0/en/api/pipelines/stable_diffusion#diffusers.StableDiffusionPipeline.enable_xformers_memory_efficient_attention">enable_xformers_memory_efficient_attention()</a> to further minimize memory use.</p>
	<p>To perform the VAE decode one image at a time, invoke <a href="/docs/diffusers/v0.9.0/en/api/pipelines/stable_diffusion#diffusers.StableDiffusionPipeline.enable_vae_slicing">enable_vae_slicing()</a> in your pipeline before inference. For example:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline

	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"runwayml/stable-diffusion-v1-5"</span>,
	revision=<span class="hljs-string">"fp16"</span>,
	torch_dtype=torch.float16,
	)
	pipe = pipe.to(<span class="hljs-string">"cuda"</span>)

	prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
	pipe.enable_vae_slicing()
	images = pipe([prompt] * <span class="hljs-number">32</span>).images<!-- HTML_TAG_END --></pre></div>
	<p>You may see a small performance boost in VAE decode on multi-image batches. There should be no performance impact on single-image batches.</p>
	<h2 class="relative group"><a id="offloading-to-cpu-with-accelerate-for-memory-savings" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#offloading-to-cpu-with-accelerate-for-memory-savings"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Offloading to CPU with accelerate for memory savings
	</span></h2>

	<p>For additional memory savings, you can offload the weights to CPU and load them to GPU when performing the forward pass.</p>
	<p>To perform CPU offloading, all you have to do is invoke <code>enable_sequential_cpu_offload()</code>:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline

	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"runwayml/stable-diffusion-v1-5"</span>,
	revision=<span class="hljs-string">"fp16"</span>,
	torch_dtype=torch.float16,
	)
	pipe = pipe.to(<span class="hljs-string">"cuda"</span>)

	prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
	pipe.enable_sequential_cpu_offload()
	image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div>
	<p>And you can get the memory consumption to < 2GB.</p>
	<p>If is also possible to chain it with attention slicing for minimal memory consumption, running it in as little as < 800mb of GPU vRAM:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline

	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"runwayml/stable-diffusion-v1-5"</span>,
	revision=<span class="hljs-string">"fp16"</span>,
	torch_dtype=torch.float16,
	)
	pipe = pipe.to(<span class="hljs-string">"cuda"</span>)

	prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
	pipe.enable_sequential_cpu_offload()
	pipe.enable_attention_slicing(<span class="hljs-number">1</span>)

	image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div>
	<h2 class="relative group"><a id="using-channels-last-memory-format" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-channels-last-memory-format"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Using Channels Last memory format
	</span></h2>

	<p>Channels last memory format is an alternative way of ordering NCHW tensors in memory preserving dimensions ordering. Channels last tensors ordered in such a way that channels become the densest dimension (aka storing images pixel-per-pixel). Since not all operators currently support channels last format it may result in a worst performance, so it’s better to try it and see if it works for your model.</p>
	<p>For example, in order to set the UNet model in our pipeline to use channels last format, we can use the following:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(pipe.unet.conv_out.state_dict()[<span class="hljs-string">"weight"</span>].stride()) <span class="hljs-comment"># (2880, 9, 3, 1)</span>
	pipe.unet.to(memory_format=torch.channels_last) <span class="hljs-comment"># in-place operation</span>
	<span class="hljs-built_in">print</span>(
	pipe.unet.conv_out.state_dict()[<span class="hljs-string">"weight"</span>].stride()
	) <span class="hljs-comment"># (2880, 1, 960, 320) having a stride of 1 for the 2nd dimension proves that it works</span><!-- HTML_TAG_END --></pre></div>
	<h2 class="relative group"><a id="tracing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tracing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Tracing
	</span></h2>

	<p>Tracing runs an example input tensor through your model, and captures the operations that are invoked as that input makes its way through the model’s layers so that an executable or <code>ScriptFunction</code> is returned that will be optimized using just-in-time compilation.</p>
	<p>To trace our UNet model, we can use the following:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> time
	<span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline
	<span class="hljs-keyword">import</span> functools

	<span class="hljs-comment"># torch disable grad</span>
	torch.set_grad_enabled(<span class="hljs-literal">False</span>)

	<span class="hljs-comment"># set variables</span>
	n_experiments = <span class="hljs-number">2</span>
	unet_runs_per_experiment = <span class="hljs-number">50</span>

	<span class="hljs-comment"># load inputs</span>
	<span class="hljs-keyword">def</span> <span class="hljs-title function_">generate_inputs</span>():
	sample = torch.randn(<span class="hljs-number">2</span>, <span class="hljs-number">4</span>, <span class="hljs-number">64</span>, <span class="hljs-number">64</span>).half().cuda()
	timestep = torch.rand(<span class="hljs-number">1</span>).half().cuda() * <span class="hljs-number">999</span>
	encoder_hidden_states = torch.randn(<span class="hljs-number">2</span>, <span class="hljs-number">77</span>, <span class="hljs-number">768</span>).half().cuda()
	<span class="hljs-keyword">return</span> sample, timestep, encoder_hidden_states


	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"runwayml/stable-diffusion-v1-5"</span>,
	revision=<span class="hljs-string">"fp16"</span>,
	torch_dtype=torch.float16,
	).to(<span class="hljs-string">"cuda"</span>)
	unet = pipe.unet
	unet.<span class="hljs-built_in">eval</span>()
	unet.to(memory_format=torch.channels_last) <span class="hljs-comment"># use channels_last memory format</span>
	unet.forward = functools.partial(unet.forward, return_dict=<span class="hljs-literal">False</span>) <span class="hljs-comment"># set return_dict=False as default</span>

	<span class="hljs-comment"># warmup</span>
	<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">3</span>):
	<span class="hljs-keyword">with</span> torch.inference_mode():
	inputs = generate_inputs()
	orig_output = unet(*inputs)

	<span class="hljs-comment"># trace</span>
	<span class="hljs-built_in">print</span>(<span class="hljs-string">"tracing.."</span>)
	unet_traced = torch.jit.trace(unet, inputs)
	unet_traced.<span class="hljs-built_in">eval</span>()
	<span class="hljs-built_in">print</span>(<span class="hljs-string">"done tracing"</span>)


	<span class="hljs-comment"># warmup and optimize graph</span>
	<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">5</span>):
	<span class="hljs-keyword">with</span> torch.inference_mode():
	inputs = generate_inputs()
	orig_output = unet_traced(*inputs)


	<span class="hljs-comment"># benchmarking</span>
	<span class="hljs-keyword">with</span> torch.inference_mode():
	<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_experiments):
	torch.cuda.synchronize()
	start_time = time.time()
	<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(unet_runs_per_experiment):
	orig_output = unet_traced(*inputs)
	torch.cuda.synchronize()
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"unet traced inference took <span class="hljs-subst">{time.time() - start_time:<span class="hljs-number">.2</span>f}</span> seconds"</span>)
	<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_experiments):
	torch.cuda.synchronize()
	start_time = time.time()
	<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(unet_runs_per_experiment):
	orig_output = unet(*inputs)
	torch.cuda.synchronize()
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"unet inference took <span class="hljs-subst">{time.time() - start_time:<span class="hljs-number">.2</span>f}</span> seconds"</span>)

	<span class="hljs-comment"># save the model</span>
	unet_traced.save(<span class="hljs-string">"unet_traced.pt"</span>)<!-- HTML_TAG_END --></pre></div>
	<p>Then we can replace the <code>unet</code> attribute of the pipeline with the traced model like the following</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline
	<span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> dataclasses <span class="hljs-keyword">import</span> dataclass


	<span class="hljs-meta">@dataclass</span>
	<span class="hljs-keyword">class</span> <span class="hljs-title class_">UNet2DConditionOutput</span>:
	sample: torch.FloatTensor


	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"runwayml/stable-diffusion-v1-5"</span>,
	revision=<span class="hljs-string">"fp16"</span>,
	torch_dtype=torch.float16,
	).to(<span class="hljs-string">"cuda"</span>)

	<span class="hljs-comment"># use jitted unet</span>
	unet_traced = torch.jit.load(<span class="hljs-string">"unet_traced.pt"</span>)
	<span class="hljs-comment"># del pipe.unet</span>
	<span class="hljs-keyword">class</span> <span class="hljs-title class_">TracedUNet</span>(torch.nn.Module):
	<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>):
	<span class="hljs-built_in">super</span>().__init__()
	self.in_channels = pipe.unet.in_channels
	self.device = pipe.unet.device

	<span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">self, latent_model_input, t, encoder_hidden_states</span>):
	sample = unet_traced(latent_model_input, t, encoder_hidden_states)[<span class="hljs-number">0</span>]
	<span class="hljs-keyword">return</span> UNet2DConditionOutput(sample=sample)


	pipe.unet = TracedUNet()

	<span class="hljs-keyword">with</span> torch.inference_mode():
	image = pipe([prompt] * <span class="hljs-number">1</span>, num_inference_steps=<span class="hljs-number">50</span>).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div>
	<h2 class="relative group"><a id="memory-efficient-attention" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memory-efficient-attention"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Memory Efficient Attention
	</span></h2>

	Recent work on optimizing the bandwitdh in the attention block have generated huge speed ups and gains in GPU memory usage. The most recent being Flash Attention (from @tridao, [code](https://github.com/HazyResearch/flash-attention), [paper](https://arxiv.org/pdf/2205.14135.pdf)) .
	Here are the speedups we obtain on a few Nvidia GPUs when running the inference at 512x512 with a batch size of 1 (one prompt):
	<table><thead><tr><th>GPU</th>
	<th>Base Attention FP16</th>
	<th>Memory Efficient Attention FP16</th></tr></thead>
	<tbody><tr><td>NVIDIA Tesla T4</td>
	<td>3.5it/s</td>
	<td>5.5it/s</td></tr>
	<tr><td>NVIDIA 3060 RTX</td>
	<td>4.6it/s</td>
	<td>7.8it/s</td></tr>
	<tr><td>NVIDIA A10G</td>
	<td>8.88it/s</td>
	<td>15.6it/s</td></tr>
	<tr><td>NVIDIA RTX A6000</td>
	<td>11.7it/s</td>
	<td>21.09it/s</td></tr>
	<tr><td>NVIDIA TITAN RTX</td>
	<td>12.51it/s</td>
	<td>18.22it/s</td></tr>
	<tr><td>A100-SXM4-40GB</td>
	<td>18.6it/s</td>
	<td>29.it/s</td></tr>
	<tr><td>A100-SXM-80GB</td>
	<td>18.7it/s</td>
	<td>29.5it/s</td></tr></tbody></table>
	<p>To leverage it just make sure you have: </p>
	<ul><li>PyTorch > 1.12</li>
	<li>Cuda available</li>
	<li>Installed the <a href="https://github.com/facebookresearch/xformers" rel="nofollow">xformers</a> library</li></ul>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline
	<span class="hljs-keyword">import</span> torch

	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"runwayml/stable-diffusion-v1-5"</span>,
	revision=<span class="hljs-string">"fp16"</span>,
	torch_dtype=torch.float16,
	).to(<span class="hljs-string">"cuda"</span>)

	pipe.enable_xformers_memory_efficient_attention()

	<span class="hljs-keyword">with</span> torch.inference_mode():
	sample = pipe(<span class="hljs-string">"a small cat"</span>)

	<span class="hljs-comment"># optional: You can disable it via</span>
	<span class="hljs-comment"># pipe.disable_xformers_memory_efficient_attention()</span><!-- HTML_TAG_END --></pre></div>


	<script type="module" data-hydrate="b3hg2f">
	import { start } from "/docs/diffusers/v0.9.0/en/_app/start-hf-doc-builder.js";
	start({
	target: document.querySelector('[data-hydrate="b3hg2f"]').parentNode,
	paths: {"base":"/docs/diffusers/v0.9.0/en","assets":"/docs/diffusers/v0.9.0/en"},
	session: {},
	route: false,
	spa: false,
	trailing_slash: "never",
	hydrate: {
	status: 200,
	error: null,
	nodes: [
	import("/docs/diffusers/v0.9.0/en/_app/pages/__layout.svelte-hf-doc-builder.js"),
	import("/docs/diffusers/v0.9.0/en/_app/pages/optimization/fp16.mdx-hf-doc-builder.js")
	],
	params: {}
	}
	});
	</script>

Xet Storage Details

Size:: 49.6 kB
Xet hash:: 74d9d380b1ff34b287093ff7262a05a136f663a7c574de43788aea2ad84840d1

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.