Buckets:

hf-doc-build
/

doc

Files

xet

hf-doc-build/doc / diffusers /v0.5.0 /en /optimization /fp16.html

rtrm

27 days ago

download

raw

35.2 kB

	<meta charset="utf-8" /><meta http-equiv="content-security-policy" content=""><meta name="hf:doc:metadata" content="{&quot;local&quot;:&quot;memory-and-speed&quot;,&quot;sections&quot;:[{&quot;local&quot;:&quot;enable-cudnn-autotuner&quot;,&quot;sections&quot;:[{&quot;local&quot;:&quot;use-tf32-instead-of-fp32-on-ampere-and-later-cuda-devices&quot;,&quot;title&quot;:&quot;Use tf32 instead of fp32 (on Ampere and later CUDA devices)&quot;}],&quot;title&quot;:&quot;Enable cuDNN auto-tuner&quot;},{&quot;local&quot;:&quot;automatic-mixed-precision-amp&quot;,&quot;title&quot;:&quot;Automatic mixed precision (AMP)&quot;},{&quot;local&quot;:&quot;half-precision-weights&quot;,&quot;title&quot;:&quot;Half precision weights&quot;},{&quot;local&quot;:&quot;sliced-attention-for-additional-memory-savings&quot;,&quot;title&quot;:&quot;Sliced attention for additional memory savings&quot;},{&quot;local&quot;:&quot;using-channels-last-memory-format&quot;,&quot;title&quot;:&quot;Using Channels Last memory format&quot;},{&quot;local&quot;:&quot;tracing&quot;,&quot;title&quot;:&quot;Tracing&quot;}],&quot;title&quot;:&quot;Memory and speed&quot;}" data-svelte="svelte-1phssyn">
	<link rel="modulepreload" href="/docs/diffusers/v0.5.0/en/_app/assets/pages/__layout.svelte-hf-doc-builder.css">
	<link rel="modulepreload" href="/docs/diffusers/v0.5.0/en/_app/start-hf-doc-builder.js">
	<link rel="modulepreload" href="/docs/diffusers/v0.5.0/en/_app/chunks/vendor-hf-doc-builder.js">
	<link rel="modulepreload" href="/docs/diffusers/v0.5.0/en/_app/chunks/paths-hf-doc-builder.js">
	<link rel="modulepreload" href="/docs/diffusers/v0.5.0/en/_app/pages/__layout.svelte-hf-doc-builder.js">
	<link rel="modulepreload" href="/docs/diffusers/v0.5.0/en/_app/pages/optimization/fp16.mdx-hf-doc-builder.js">
	<link rel="modulepreload" href="/docs/diffusers/v0.5.0/en/_app/chunks/Tip-hf-doc-builder.js">
	<link rel="modulepreload" href="/docs/diffusers/v0.5.0/en/_app/chunks/IconCopyLink-hf-doc-builder.js">
	<link rel="modulepreload" href="/docs/diffusers/v0.5.0/en/_app/chunks/CodeBlock-hf-doc-builder.js">






	<h1 class="relative group"><a id="memory-and-speed" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memory-and-speed"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Memory and speed
	</span></h1>

	<p>We present some techniques and ideas to optimize 🤗 Diffusers <em>inference</em> for memory or speed.</p>
	<table><thead><tr><th></th>
	<th>Latency</th>
	<th>Speedup</th></tr></thead>
	<tbody><tr><td>original</td>
	<td>9.50s</td>
	<td>x1</td></tr>
	<tr><td>cuDNN auto-tuner</td>
	<td>9.37s</td>
	<td>x1.01</td></tr>
	<tr><td>autocast (fp16)</td>
	<td>5.47s</td>
	<td>x1.74</td></tr>
	<tr><td>fp16</td>
	<td>3.61s</td>
	<td>x2.63</td></tr>
	<tr><td>channels last</td>
	<td>3.30s</td>
	<td>x2.88</td></tr>
	<tr><td>traced UNet</td>
	<td>3.21s</td>
	<td>x2.96</td></tr></tbody></table>
	<em>obtained on NVIDIA TITAN RTX by generating a single image of size 512x512 from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps.</em>
	<h2 class="relative group"><a id="enable-cudnn-autotuner" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#enable-cudnn-autotuner"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Enable cuDNN auto-tuner
	</span></h2>

	<p><a href="https://developer.nvidia.com/cudnn" rel="nofollow">NVIDIA cuDNN</a> supports many algorithms to compute a convolution. The autotuner runs a short benchmark and selects the kernel with the best performance on the given hardware for a given input size.</p>
	<p>Since we’re using <strong>convolutional networks</strong> (other types are currently not supported), we can enable the cuDNN autotuner before launching the inference by setting:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch

torch.backends.cudnn.benchmark = <span class="hljs-literal">True</span><!-- HTML_TAG_END --></pre></div>
	<h3 class="relative group"><a id="use-tf32-instead-of-fp32-on-ampere-and-later-cuda-devices" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#use-tf32-instead-of-fp32-on-ampere-and-later-cuda-devices"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Use tf32 instead of fp32 (on Ampere and later CUDA devices)
	</span></h3>

	<p>On Ampere and later CUDA devices matrix multiplications and convolutions can use the TensorFloat32 (TF32) mode for faster but slightly less accurate computations. By default PyTorch enables TF32 mode for convolutions but not matrix multiplications, and unless a network requires full float32 precision we recommend enabling this setting for matrix multiplications, too. It can significantly speed up computations with typically negligible loss of numerical accuracy. You can read more about it <a href="https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32" rel="nofollow">here</a>. All you need to do is to add this before your inference:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch

torch.backends.cuda.matmul.allow_tf32 = <span class="hljs-literal">True</span><!-- HTML_TAG_END --></pre></div>
	<h2 class="relative group"><a id="automatic-mixed-precision-amp" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#automatic-mixed-precision-amp"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Automatic mixed precision (AMP)
	</span></h2>

	<p>If you use a CUDA GPU, you can take advantage of <code>torch.autocast</code> to perform inference roughly twice as fast at the cost of slightly lower precision. All you need to do is put your inference call inside an <code>autocast</code> context manager. The following example shows how to do it with Stable Diffusion text-to-image generation:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> torch <span class="hljs-keyword">import</span> autocast
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(<span class="hljs-string">"CompVis/stable-diffusion-v1-4"</span>)
pipe = pipe.to(<span class="hljs-string">"cuda"</span>)

prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
<span class="hljs-keyword">with</span> autocast(<span class="hljs-string">"cuda"</span>):
    image = pipe(prompt).images[<span class="hljs-number">0</span>] <!-- HTML_TAG_END --></pre></div>
	<p>Despite the precision loss, in our experience the final image results look the same as the <code>float32</code> versions. Feel free to experiment and report back!</p>
	<h2 class="relative group"><a id="half-precision-weights" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#half-precision-weights"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Half precision weights
	</span></h2>

	<p>To save more GPU memory and get even more speed, you can load and run the model weights directly in half precision. This involves loading the float16 version of the weights, which was saved to a branch named <code>fp16</code>, and telling PyTorch to use the <code>float16</code> type when loading them:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    <span class="hljs-string">"CompVis/stable-diffusion-v1-4"</span>,
    revision=<span class="hljs-string">"fp16"</span>,
    torch_dtype=torch.float16,
)
pipe = pipe.to(<span class="hljs-string">"cuda"</span>)

prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
image = pipe(prompt).images[<span class="hljs-number">0</span>] <!-- HTML_TAG_END --></pre></div>
	<h2 class="relative group"><a id="sliced-attention-for-additional-memory-savings" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sliced-attention-for-additional-memory-savings"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Sliced attention for additional memory savings
	</span></h2>

	<p>For additional memory savings, you can use a sliced version of attention that performs the computation in steps instead of all at once.</p>


	<div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">Attention slicing is useful even if a batch size of just 1 is used - as long as the model uses more than one attention head. If there is more than one attention head the QK^T attention matrix can be computed sequentially for each head which can save a significant amount of memory.
	</div>
	<p>To perform the attention computation sequentially over each head, you only need to invoke <a href="/docs/diffusers/v0.5.0/en/api/pipelines/stable_diffusion#diffusers.StableDiffusionPipeline.enable_attention_slicing">enable_attention_slicing()</a> in your pipeline before inference, like here:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    <span class="hljs-string">"CompVis/stable-diffusion-v1-4"</span>,
    revision=<span class="hljs-string">"fp16"</span>,
    torch_dtype=torch.float16,
)
pipe = pipe.to(<span class="hljs-string">"cuda"</span>)

prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
pipe.enable_attention_slicing()
image = pipe(prompt).images[<span class="hljs-number">0</span>] <!-- HTML_TAG_END --></pre></div>
	<p>There’s a small performance penalty of about 10% slower inference times, but this method allows you to use Stable Diffusion in as little as 3.2 GB of VRAM!</p>
	<h2 class="relative group"><a id="using-channels-last-memory-format" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-channels-last-memory-format"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Using Channels Last memory format
	</span></h2>

	<p>Channels last memory format is an alternative way of ordering NCHW tensors in memory while preserving the dimension ordering. Channels last tensors are ordered in such a way that channels become the densest dimension (i.e. images are stored pixel by pixel). Since not all operators currently support the channels last format, it may result in worse performance, so it’s better to try it and see if it works for your model.</p>
	<p>For example, in order to set the UNet model in our pipeline to use channels last format, we can use the following:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(pipe.unet.conv_out.state_dict()[<span class="hljs-string">"weight"</span>].stride()) <span class="hljs-comment"># (2880, 9, 3, 1)</span>
pipe.unet.to(memory_format=torch.channels_last) <span class="hljs-comment"># in-place operation</span>
<span class="hljs-built_in">print</span>(
    pipe.unet.conv_out.state_dict()[<span class="hljs-string">"weight"</span>].stride()
) <span class="hljs-comment"># (2880, 1, 960, 320) having a stride of 1 for the 2nd dimension proves that it works</span><!-- HTML_TAG_END --></pre></div>
	<h2 class="relative group"><a id="tracing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tracing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Tracing
	</span></h2>

	<p>Tracing runs an example input tensor through your model, and captures the operations that are invoked as that input makes its way through the model’s layers so that an executable or <code>ScriptFunction</code> is returned that will be optimized using just-in-time compilation.</p>
	<p>To trace our UNet model, we can use the following:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline
<span class="hljs-keyword">import</span> functools

<span class="hljs-comment"># torch disable grad</span>
torch.set_grad_enabled(<span class="hljs-literal">False</span>)

<span class="hljs-comment"># set variables</span>
n_experiments = <span class="hljs-number">2</span>
unet_runs_per_experiment = <span class="hljs-number">50</span>

<span class="hljs-comment"># load inputs</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">generate_inputs</span>():
    sample = torch.randn(<span class="hljs-number">2</span>, <span class="hljs-number">4</span>, <span class="hljs-number">64</span>, <span class="hljs-number">64</span>).half().cuda()
    timestep = torch.rand(<span class="hljs-number">1</span>).half().cuda() * <span class="hljs-number">999</span>
    encoder_hidden_states = torch.randn(<span class="hljs-number">2</span>, <span class="hljs-number">77</span>, <span class="hljs-number">768</span>).half().cuda()
    <span class="hljs-keyword">return</span> sample, timestep, encoder_hidden_states


pipe = StableDiffusionPipeline.from_pretrained(
    <span class="hljs-string">"CompVis/stable-diffusion-v1-4"</span>,
    revision=<span class="hljs-string">"fp16"</span>,
    torch_dtype=torch.float16,
).to(<span class="hljs-string">"cuda"</span>)
unet = pipe.unet
unet.<span class="hljs-built_in">eval</span>()
unet.to(memory_format=torch.channels_last) <span class="hljs-comment"># use channels_last memory format</span>
unet.forward = functools.partial(unet.forward, return_dict=<span class="hljs-literal">False</span>) <span class="hljs-comment"># set return_dict=False as default</span>

<span class="hljs-comment"># warmup</span>
<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">3</span>):
    <span class="hljs-keyword">with</span> torch.inference_mode():
        inputs = generate_inputs()
        orig_output = unet(*inputs)

<span class="hljs-comment"># trace</span>
<span class="hljs-built_in">print</span>(<span class="hljs-string">"tracing.."</span>)
unet_traced = torch.jit.trace(unet, inputs)
unet_traced.<span class="hljs-built_in">eval</span>()
<span class="hljs-built_in">print</span>(<span class="hljs-string">"done tracing"</span>)


<span class="hljs-comment"># warmup and optimize graph</span>
<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">5</span>):
    <span class="hljs-keyword">with</span> torch.inference_mode():
        inputs = generate_inputs()
        orig_output = unet_traced(*inputs)


<span class="hljs-comment"># benchmarking</span>
<span class="hljs-keyword">with</span> torch.inference_mode():
    <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_experiments):
        torch.cuda.synchronize()
        start_time = time.time()
        <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(unet_runs_per_experiment):
            orig_output = unet_traced(*inputs)
        torch.cuda.synchronize()
        <span class="hljs-built_in">print</span>(<span class="hljs-string">f"unet traced inference took <span class="hljs-subst">{time.time() - start_time:<span class="hljs-number">.2</span>f}</span> seconds"</span>)
    <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_experiments):
        torch.cuda.synchronize()
        start_time = time.time()
        <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(unet_runs_per_experiment):
            orig_output = unet(*inputs)
        torch.cuda.synchronize()
        <span class="hljs-built_in">print</span>(<span class="hljs-string">f"unet inference took <span class="hljs-subst">{time.time() - start_time:<span class="hljs-number">.2</span>f}</span> seconds"</span>)

<span class="hljs-comment"># save the model</span>
unet_traced.save(<span class="hljs-string">"unet_traced.pt"</span>)<!-- HTML_TAG_END --></pre></div>
	<p>Then we can replace the <code>unet</code> attribute of the pipeline with the traced model as follows:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> dataclasses <span class="hljs-keyword">import</span> dataclass


<span class="hljs-meta">@dataclass</span>
<span class="hljs-keyword">class</span> <span class="hljs-title class_">UNet2DConditionOutput</span>:
    sample: torch.FloatTensor


pipe = StableDiffusionPipeline.from_pretrained(
    <span class="hljs-string">"CompVis/stable-diffusion-v1-4"</span>,
    revision=<span class="hljs-string">"fp16"</span>,
    torch_dtype=torch.float16,
).to(<span class="hljs-string">"cuda"</span>)

<span class="hljs-comment"># use jitted unet</span>
unet_traced = torch.jit.load(<span class="hljs-string">"unet_traced.pt"</span>)
<span class="hljs-comment"># del pipe.unet</span>
<span class="hljs-keyword">class</span> <span class="hljs-title class_">TracedUNet</span>(torch.nn.Module):
    <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>):
        <span class="hljs-built_in">super</span>().__init__()
        self.in_channels = pipe.unet.in_channels
        self.device = pipe.unet.device

    <span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">self, latent_model_input, t, encoder_hidden_states</span>):
        sample = unet_traced(latent_model_input, t, encoder_hidden_states)[<span class="hljs-number">0</span>]
        <span class="hljs-keyword">return</span> UNet2DConditionOutput(sample=sample)


pipe.unet = TracedUNet()

prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
<span class="hljs-keyword">with</span> torch.inference_mode():
    image = pipe([prompt] * <span class="hljs-number">1</span>, num_inference_steps=<span class="hljs-number">50</span>).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div>


	<script type="module" data-hydrate="1u8gjos">
	// Hydration bootstrap for this pre-rendered page: imports `start` from the
	// doc-builder app bundle and calls it once with the options below.
	// NOTE(review): the option names look like SvelteKit's client start() — confirm.
	import { start } from "/docs/diffusers/v0.5.0/en/_app/start-hf-doc-builder.js";
	start({
	// Mount point: the parent node of the element carrying data-hydrate="1u8gjos".
	target: document.querySelector('[data-hydrate="1u8gjos"]').parentNode,
	// Base and asset URLs both point at the versioned English docs root.
	paths: {"base":"/docs/diffusers/v0.5.0/en","assets":"/docs/diffusers/v0.5.0/en"},
	session: {},
	route: false,
	spa: false,
	trailing_slash: "never",
	hydrate: {
	status: 200,
	error: null,
	// Modules loaded dynamically for hydration: the shared layout first,
	// then this page's module (optimization/fp16).
	nodes: [
	import("/docs/diffusers/v0.5.0/en/_app/pages/__layout.svelte-hf-doc-builder.js"),
	import("/docs/diffusers/v0.5.0/en/_app/pages/optimization/fp16.mdx-hf-doc-builder.js")
	],
	params: {}
	}
	});
	</script>

Xet Storage Details

Size:: 35.2 kB
Xet hash:: 50dc18e220fdd08865c141187d704d6e802b9b256303ce0a0a5ff34d8ce443e3

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.