<h1 class="relative group"><a id="torch20-support-in-diffusers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#torch20-support-in-diffusers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
<span>Torch2.0 support in Diffusers
</span></h1>
Starting from version `0.13.0`, Diffusers supports the latest optimizations from the upcoming [PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/) release. These include:

1. Support for native flash and memory-efficient attention without any extra dependencies.
2. [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) support for compiling individual models for an extra performance boost.
<h2 class="relative group"><a id="installation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#installation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
<span>Installation
</span></h2>
To benefit from the native efficient attention and `torch.compile`, you will need to install the nightly version of PyTorch, as the stable version has not yet been released. The first step is to install CUDA 11.7 or CUDA 11.8, since PyTorch 2.0 does not support earlier CUDA versions. Once CUDA is installed, the PyTorch nightly can be installed with:
<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
Copied</div></button></div>
<pre><!-- HTML_TAG_START -->pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu117<!-- HTML_TAG_END --></pre></div>
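To verify that the nightly build is active and that the new attention entry point is available, a quick sanity check along these lines can help (a minimal sketch; the exact version string varies by nightly build):

```python
import torch

# Nightly builds report a 2.0 development version, e.g. "2.0.0.dev2023..."
print(torch.__version__)

# PyTorch 2.0 ships the native efficient attention entry point
print(hasattr(torch.nn.functional, "scaled_dot_product_attention"))  # expect: True
```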
<h2 class="relative group"><a id="using-efficient-attention-and-torchcompile" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-efficient-attention-and-torchcompile"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
<span>Using efficient attention and torch.compile.
</span></h2>
1. **Efficient Attention**

   Efficient attention is implemented via the [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) function, which automatically enables flash or memory-efficient attention depending on the input and the GPU type. This is the same as the `memory_efficient_attention` from [xFormers](https://github.com/facebookresearch/xformers), but built natively into PyTorch.

   Efficient attention is enabled by default in Diffusers if PyTorch 2.0 is installed and `torch.nn.functional.scaled_dot_product_attention` is available. To use it, install PyTorch 2.0 as suggested above and simply use the pipeline. For example:
<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
Copied</div></button></div>
<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained(<span class="hljs-string">&quot;runwayml/stable-diffusion-v1-5&quot;</span>, torch_dtype=torch.float16)
pipe = pipe.to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;a photo of an astronaut riding a horse on mars&quot;</span>
image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div>
   If you want to enable it explicitly (which is not required), you can do so as shown below.
<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
Copied</div></button></div>
<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline
<span class="hljs-keyword">from</span> diffusers.models.cross_attention <span class="hljs-keyword">import</span> AttnProcessor2_0
pipe = StableDiffusionPipeline.from_pretrained(<span class="hljs-string">&quot;runwayml/stable-diffusion-v1-5&quot;</span>, torch_dtype=torch.float16).to(<span class="hljs-string">&quot;cuda&quot;</span>)
pipe.unet.set_attn_processor(AttnProcessor2_0())
prompt = <span class="hljs-string">&quot;a photo of an astronaut riding a horse on mars&quot;</span>
image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div>
   This should be as fast and memory efficient as `xFormers`. For reference, a standalone sketch of the underlying function follows.
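   `torch.nn.functional.scaled_dot_product_attention` can also be called directly on plain tensors. Below is a minimal sketch of what `AttnProcessor2_0` relies on under the hood (the shapes are arbitrary, chosen only for illustration, and a CUDA GPU is assumed):

   ```python
   import torch
   import torch.nn.functional as F

   # (batch, heads, sequence length, head dim) -- arbitrary illustrative shapes
   query = torch.randn(2, 8, 64, 40, dtype=torch.float16, device="cuda")
   key = torch.randn(2, 8, 64, 40, dtype=torch.float16, device="cuda")
   value = torch.randn(2, 8, 64, 40, dtype=torch.float16, device="cuda")

   # PyTorch dispatches to a flash or memory-efficient kernel
   # depending on the inputs and the GPU
   out = F.scaled_dot_product_attention(query, key, value)
   print(out.shape)  # torch.Size([2, 8, 64, 40])
   ```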
<ol start="2"><li><p><strong>torch.compile</strong></p>
<p>To get an additional speedup, we can use the new <code>torch.compile</code> feature. To do so, we wrap our <code>unet</code> with <code>torch.compile</code>. For more information and different options, refer to the
<a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html" rel="nofollow">torch compile docs</a>.</p>
<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
Copied</div></button></div>
<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained(<span class="hljs-string">&quot;runwayml/stable-diffusion-v1-5&quot;</span>, torch_dtype=torch.float16).to(
<span class="hljs-string">&quot;cuda&quot;</span>
)
pipe.unet = torch.<span class="hljs-built_in">compile</span>(pipe.unet)
batch_size = <span class="hljs-number">10</span>
prompt = <span class="hljs-string">&quot;A photo of an astronaut riding a horse on marse.&quot;</span>
images = pipe(prompt, num_inference_steps=steps, num_images_per_prompt=batch_size).images<!-- HTML_TAG_END --></pre></div>
   Depending on the GPU, `torch.compile` can give a 2-9% speed-up over efficient attention. Note, however, that as of now the speed-up is mostly noticeable on the more recent GPU architectures, such as the A100.

   Compilation itself also takes some time to complete, so `torch.compile` is best suited for situations where you prepare your pipeline once and then perform the same type of inference operation many times. A warm-up sketch follows.
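   Because compilation happens lazily on the first call, a common pattern is to trigger it once with a cheap warm-up run before timing or serving requests. A minimal sketch, assuming the `pipe` from the snippet above:

   ```python
   # The first call compiles the UNet; expect a noticeable one-time delay.
   _ = pipe("warm-up prompt", num_inference_steps=2).images

   # Subsequent calls reuse the compiled model and run at full speed.
   images = pipe("A photo of an astronaut riding a horse on mars.", num_inference_steps=50).images
   ```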
<h2 class="relative group"><a id="benchmark" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#benchmark"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
<span>Benchmark
</span></h2>
We conducted a simple benchmark on different GPUs to compare vanilla attention, xFormers, `torch.nn.functional.scaled_dot_product_attention`, and `torch.compile` combined with `torch.nn.functional.scaled_dot_product_attention`. For the benchmark we used the [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) model with 50 steps. The `xFormers` benchmark was run with `torch==1.13.1`. The tables below summarize the results. The `Speed over xformers` column denotes the speed-up gained over `xFormers` by using `torch.compile` with `torch.nn.functional.scaled_dot_product_attention`.
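The speed-up columns appear to be computed as the relative latency reduction. A small sketch of the arithmetic, checked against the first A100 row of the FP16 table:

```python
def speedup_percent(baseline_s: float, optimized_s: float) -> float:
    """Relative latency reduction of optimized_s over baseline_s, in percent."""
    return (baseline_s - optimized_s) / baseline_s * 100

# First A100 FP16 row: xFormers 8.7 s vs. SDPA + torch.compile 7.89 s
print(round(speedup_percent(8.7, 7.89), 2))  # 9.31, matching the table
```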
<h3 class="relative group"><a id="fp16-benchmark" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fp16-benchmark"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
<span>FP16 benchmark
</span></h3>
The table below shows the benchmark results for inference using `fp16`. As we can see, `torch.nn.functional.scaled_dot_product_attention` is as fast as `xFormers` (sometimes slightly faster or slower) on all the GPUs we tested, and using `torch.compile` gives a further speed-up of up to 10% over `xFormers`, mostly noticeable on the A100 GPU.

***The times reported are in seconds.***
| GPU | Batch Size | Vanilla Attention | xFormers | PyTorch 2.0 SDPA | SDPA + torch.compile | Speed over xformers (%) |
| --- | --- | --- | --- | --- | --- | --- |
| A100 | 10 | 12.02 | 8.7 | 8.79 | 7.89 | 9.31 |
| A100 | 16 | 18.95 | 13.57 | 13.67 | 12.25 | 9.73 |
| A100 | 32 (1) | OOM | 26.56 | 26.68 | 24.08 | 9.34 |
| A100 | 64 (2) | | 52.51 | 53.03 | 47.81 | 8.95 |
| A10 | 4 | 13.94 | 9.81 | 10.01 | 9.35 | 4.69 |
| A10 | 8 | 27.09 | 19 | 19.53 | 18.33 | 3.53 |
| A10 | 10 | 33.69 | 23.53 | 24.19 | 22.52 | 4.29 |
| A10 | 16 | OOM | 37.55 | 38.31 | 36.81 | 1.97 |
| A10 | 32 (1) | | 77.19 | 78.43 | 76.64 | 0.71 |
| A10 | 64 (1) | | 173.59 | 158.99 | 155.14 | 10.63 |
| T4 | 4 | 38.81 | 30.09 | 29.74 | 27.55 | 8.44 |
| T4 | 8 | OOM | 55.71 | 55.99 | 53.85 | 3.34 |
| T4 | 10 | OOM | 68.96 | 69.86 | 65.35 | 5.23 |
| T4 | 16 | OOM | 111.47 | 113.26 | 106.93 | 4.07 |
| V100 | 4 | 9.84 | 8.16 | 8.09 | 7.65 | 6.25 |
| V100 | 8 | OOM | 15.62 | 15.44 | 14.59 | 6.59 |
| V100 | 10 | OOM | 19.52 | 19.28 | 18.18 | 6.86 |
| V100 | 16 | OOM | 30.29 | 29.84 | 28.22 | 6.83 |
| 3090 | 4 | 10.04 | 7.82 | 7.89 | 7.47 | 4.48 |
| 3090 | 8 | 19.27 | 14.97 | 15.04 | 14.22 | 5.01 |
| 3090 | 10 | 24.08 | 18.7 | 18.7 | 17.69 | 5.40 |
| 3090 | 16 | OOM | 29.06 | 29.06 | 28.2 | 2.96 |
| 3090 | 32 (1) | | 58.05 | 58 | 54.88 | 5.46 |
| 3090 | 64 (1) | | 126.54 | 126.03 | 117.33 | 7.28 |
| 3090 Ti | 4 | 9.07 | 7.14 | 7.15 | 6.81 | 4.62 |
| 3090 Ti | 8 | 17.51 | 13.65 | 13.72 | 12.99 | 4.84 |
| 3090 Ti | 10 (2) | 21.79 | 16.85 | 16.93 | 16.02 | 4.93 |
| 3090 Ti | 16 | OOM | 26.1 | 26.28 | 25.46 | 2.45 |
| 3090 Ti | 32 (1) | | 51.78 | 52.04 | 49.15 | 5.08 |
| 3090 Ti | 64 (1) | | 112.02 | 112.33 | 103.91 | 7.24 |
<h3 class="relative group"><a id="fp32-benchmark" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fp32-benchmark"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
<span>FP32 benchmark
</span></h3>
The table below shows the benchmark results for inference using `fp32`. As we can see, `torch.nn.functional.scaled_dot_product_attention` is as fast as `xFormers` (sometimes slightly faster or slower) on all the GPUs we tested. Using `torch.compile` with efficient attention gives up to an 18% performance improvement over `xFormers` on Ampere cards, and up to 20% over vanilla attention.
| GPU | Batch Size | Vanilla Attention | xFormers | PyTorch 2.0 SDPA | SDPA + torch.compile | Speed over xformers (%) | Speed over vanilla (%) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| A100 | 4 | 16.56 | 12.42 | 12.2 | 11.84 | 4.67 | 28.50 |
| A100 | 10 | OOM | 29.93 | 29.44 | 28.5 | 4.78 | |
| A100 | 16 | | 47.08 | 46.27 | 44.8 | 4.84 | |
| A100 | 32 | | 92.89 | 91.34 | 88.35 | 4.89 | |
| A100 | 64 | | 185.3 | 182.71 | 176.48 | 4.76 | |
| A10 | 1 | 10.59 | 8.81 | 7.51 | 7.35 | 16.57 | 30.59 |
| A10 | 4 | 34.77 | 27.63 | 22.77 | 22.07 | 20.12 | 36.53 |
| A10 | 8 | | 56.19 | 43.53 | 43.86 | 21.94 | |
| A10 | 16 | | 116.49 | 88.56 | 86.64 | 25.62 | |
| A10 | 32 | | 221.95 | 175.74 | 168.18 | 24.23 | |
| A10 | 48 | | 333.23 | 264.84 | | 20.52 | |
| T4 | 1 | 28.2 | 24.49 | 23.93 | 23.56 | 3.80 | 16.45 |
| T4 | 2 | 52.77 | 45.7 | 45.88 | 45.06 | 1.40 | 14.61 |
| T4 | 4 | OOM | 85.72 | 85.78 | 84.48 | 1.45 | |
| T4 | 8 | | 149.64 | 150.75 | 148.4 | 0.83 | |
| V100 | 1 | 7.4 | 6.84 | 6.8 | 6.66 | 2.63 | 10.00 |
| V100 | 2 | 13.85 | 12.81 | 12.66 | 12.35 | 3.59 | 10.83 |
| V100 | 4 | OOM | 25.73 | 25.31 | 24.78 | 3.69 | |
| V100 | 8 | | 43.95 | 43.37 | 42.25 | 3.87 | |
| V100 | 16 | | 84.99 | 84.73 | 82.55 | 2.87 | |
| 3090 | 1 | 7.09 | 6.78 | 6.11 | 6.03 | 11.06 | 14.95 |
| 3090 | 4 | 22.69 | 21.45 | 18.67 | 18.09 | 15.66 | 20.27 |
| 3090 | 8 (2) | | 42.59 | 36.75 | 35.59 | 16.44 | |
| 3090 | 16 | | 85.35 | 72.37 | 70.25 | 17.69 | |
| 3090 | 32 (1) | | 162.05 | 138.99 | 134.53 | 16.98 | |
| 3090 | 48 | | 241.91 | 207.75 | | 14.12 | |
| 3090 Ti | 1 | 6.45 | 6.19 | 5.64 | 5.49 | 11.31 | 14.88 |
| 3090 Ti | 4 | 20.32 | 19.31 | 16.9 | 16.37 | 15.23 | 19.44 |
| 3090 Ti | 8 (2) | | 37.93 | 33.05 | 31.99 | 15.66 | |
| 3090 Ti | 16 | | 75.37 | 65.25 | 64.32 | 14.66 | |
| 3090 Ti | 32 (1) | | 142.55 | 124.44 | 120.74 | 15.30 | |
| 3090 Ti | 48 | | 213.19 | 186.55 | | 12.50 | |
| 4090 | 1 | 5.54 | 4.99 | 4.51 | | | |
| 4090 | 4 | 13.67 | 11.4 | 10.3 | | | |
| 4090 | 8 (2) | | 19.79 | 17.13 | | | |
| 4090 | 16 | | 38.62 | 33.14 | | | |
| 4090 | 32 (1) | | 76.57 | 65.96 | | | |
| 4090 | 48 | | 114.44 | 98.78 | | | |
(1) Batch sizes >= 32 require `enable_vae_slicing()` because of [PyTorch issue #81665](https://github.com/pytorch/pytorch/issues/81665). This is required for PyTorch 1.13.1, and also for PyTorch 2.0 at batch size 64.
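VAE slicing is enabled on the pipeline before running inference; a minimal sketch, reusing the `pipe` from the snippets above:

```python
# Decode latents one image at a time to work around the large-batch OOM
pipe.enable_vae_slicing()
images = pipe("a photo of an astronaut riding a horse on mars", num_inference_steps=50, num_images_per_prompt=32).images
```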
For more details about how this benchmark was run, please refer to [this PR](https://github.com/huggingface/diffusers/pull/2303).
<script type="module" data-hydrate="nqyey7">
import { start } from "/docs/diffusers/v0.13.0/en/_app/start-hf-doc-builder.js";
start({
target: document.querySelector('[data-hydrate="nqyey7"]').parentNode,
paths: {"base":"/docs/diffusers/v0.13.0/en","assets":"/docs/diffusers/v0.13.0/en"},
session: {},
route: false,
spa: false,
trailing_slash: "never",
hydrate: {
status: 200,
error: null,
nodes: [
import("/docs/diffusers/v0.13.0/en/_app/pages/__layout.svelte-hf-doc-builder.js"),
import("/docs/diffusers/v0.13.0/en/_app/pages/optimization/torch2.0.mdx-hf-doc-builder.js")
],
params: {}
}
});
</script>

Xet Storage Details

Size:
29.6 kB
·
Xet hash:
ebe1a4f481a6890548d8099b0bd3db2f21b29bf3f2f60d91351bd97e01299184

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.