Buckets:

rtrm's picture
download
raw
30.8 kB
<meta charset="utf-8" /><meta http-equiv="content-security-policy" content=""><meta name="hf:doc:metadata" content="{&quot;local&quot;:&quot;accelerated-pytorch-20-support-in-diffusers&quot;,&quot;sections&quot;:[{&quot;local&quot;:&quot;installation&quot;,&quot;title&quot;:&quot;Installation&quot;},{&quot;local&quot;:&quot;using-accelerated-transformers-and-torchcompile&quot;,&quot;title&quot;:&quot;Using accelerated transformers and torch.compile.&quot;},{&quot;local&quot;:&quot;benchmark&quot;,&quot;sections&quot;:[{&quot;local&quot;:&quot;fp16-benchmark&quot;,&quot;title&quot;:&quot;FP16 benchmark&quot;},{&quot;local&quot;:&quot;fp32-benchmark&quot;,&quot;title&quot;:&quot;FP32 benchmark&quot;}],&quot;title&quot;:&quot;Benchmark&quot;}],&quot;title&quot;:&quot;Accelerated PyTorch 2.0 support in Diffusers&quot;}" data-svelte="svelte-1phssyn">
<link rel="stylesheet" href="/docs/diffusers/v0.14.0/en/_app/assets/pages/__layout.svelte-hf-doc-builder.css">
<link rel="modulepreload" href="/docs/diffusers/v0.14.0/en/_app/start-hf-doc-builder.js">
<link rel="modulepreload" href="/docs/diffusers/v0.14.0/en/_app/chunks/vendor-hf-doc-builder.js">
<link rel="modulepreload" href="/docs/diffusers/v0.14.0/en/_app/chunks/paths-hf-doc-builder.js">
<link rel="modulepreload" href="/docs/diffusers/v0.14.0/en/_app/pages/__layout.svelte-hf-doc-builder.js">
<link rel="modulepreload" href="/docs/diffusers/v0.14.0/en/_app/pages/optimization/torch2.0.mdx-hf-doc-builder.js">
<link rel="modulepreload" href="/docs/diffusers/v0.14.0/en/_app/chunks/IconCopyLink-hf-doc-builder.js">
<link rel="modulepreload" href="/docs/diffusers/v0.14.0/en/_app/chunks/CodeBlock-hf-doc-builder.js">
<h1 class="relative group"><a id="accelerated-pytorch-20-support-in-diffusers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#accelerated-pytorch-20-support-in-diffusers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
<span>Accelerated PyTorch 2.0 support in Diffusers
</span></h1>
<p>Starting from version <code>0.13.0</code>, Diffusers supports the latest optimization from the upcoming <a href="https://pytorch.org/get-started/pytorch-2.0/" rel="nofollow">PyTorch 2.0</a> release. These include:</p>
<ol><li>Support for accelerated transformers implementation with memory-efficient attention – no extra dependencies required.</li>
<li><a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html" rel="nofollow">torch.compile</a> support for extra performance boost when individual models are compiled.</li></ol>
<h2 class="relative group"><a id="installation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#installation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
<span>Installation
</span></h2>
<p>To benefit from the accelerated transformers implementation and <code>torch.compile</code>, we will need to install the nightly version of PyTorch, as the stable version is yet to be released. The first step is to install CUDA 11.7 or CUDA 11.8,
as PyTorch 2.0 does not support the previous versions. Once CUDA is installed, torch nightly can be installed using:</p>
<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
Copied</div></button></div>
<pre><!-- HTML_TAG_START -->pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu117<!-- HTML_TAG_END --></pre></div>
<h2 class="relative group"><a id="using-accelerated-transformers-and-torchcompile" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-accelerated-transformers-and-torchcompile"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
<span>Using accelerated transformers and torch.compile.
</span></h2>
<ol><li><p><strong>Accelerated Transformers implementation</strong></p>
<p>PyTorch 2.0 includes an optimized and memory-efficient attention implementation through the <a href="https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention" rel="nofollow"><code>torch.nn.functional.scaled_dot_product_attention</code></a> function, which automatically enables several optimizations depending on the inputs and the GPU type. This is similar to the <code>memory_efficient_attention</code> from <a href="https://github.com/facebookresearch/xformers" rel="nofollow">xFormers</a>, but built natively into PyTorch. </p>
<p>These optimizations will be enabled by default in Diffusers if PyTorch 2.0 is installed and if <code>torch.nn.functional.scaled_dot_product_attention</code> is available. To use it, just install <code>torch 2.0</code> as suggested above and simply use the pipeline. For example:</p>
<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
Copied</div></button></div>
<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained(<span class="hljs-string">&quot;runwayml/stable-diffusion-v1-5&quot;</span>, torch_dtype=torch.float16)
pipe = pipe.to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;a photo of an astronaut riding a horse on mars&quot;</span>
image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div>
<p>If you want to enable it explicitly (which is not required), you can do so as shown below.</p>
<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
Copied</div></button></div>
<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline
<span class="hljs-keyword">from</span> diffusers.models.cross_attention <span class="hljs-keyword">import</span> AttnProcessor2_0
pipe = StableDiffusionPipeline.from_pretrained(<span class="hljs-string">&quot;runwayml/stable-diffusion-v1-5&quot;</span>, torch_dtype=torch.float16).to(<span class="hljs-string">&quot;cuda&quot;</span>)
pipe.unet.set_attn_processor(AttnProcessor2_0())
prompt = <span class="hljs-string">&quot;a photo of an astronaut riding a horse on mars&quot;</span>
image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div>
<p>This should be as fast and memory efficient as <code>xFormers</code>. More details <a href="#benchmark">in our benchmark</a>.</p></li></ol>
<ol start="2"><li><p><strong>torch.compile</strong></p>
<p>To get an additional speedup, we can use the new <code>torch.compile</code> feature. To do so, we simply wrap our <code>unet</code> with <code>torch.compile</code>. For more information and different options, refer to the
<a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html" rel="nofollow">torch compile docs</a>.</p>
<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
Copied</div></button></div>
<pre><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained(<span class="hljs-string">&quot;runwayml/stable-diffusion-v1-5&quot;</span>, torch_dtype=torch.float16).to(
<span class="hljs-string">&quot;cuda&quot;</span>
)
pipe.unet = torch.<span class="hljs-built_in">compile</span>(pipe.unet)
batch_size = <span class="hljs-number">10</span>
steps = <span class="hljs-number">50</span>
prompt = <span class="hljs-string">&quot;A photo of an astronaut riding a horse on mars.&quot;</span>
images = pipe(prompt, num_inference_steps=steps, num_images_per_prompt=batch_size).images<!-- HTML_TAG_END --></pre></div>
<p>Depending on the type of GPU, <code>compile()</code> can yield between 2-9% of <em>additional speed-up</em> over the accelerated transformer optimizations. Note, however, that compilation is able to squeeze more performance improvements in more recent GPU architectures such as Ampere (A100, 3090), Ada (4090) and Hopper (H100).</p>
<p>Compilation takes some time to complete, so it is best suited for situations where you need to prepare your pipeline once and then perform the same type of inference operations multiple times.</p></li></ol>
<h2 class="relative group"><a id="benchmark" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#benchmark"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
<span>Benchmark
</span></h2>
<p>We conducted a simple benchmark on different GPUs to compare vanilla attention, xFormers, <code>torch.nn.functional.scaled_dot_product_attention</code> and <code>torch.compile+torch.nn.functional.scaled_dot_product_attention</code>.
For the benchmark we used the <a href="https://huggingface.co/CompVis/stable-diffusion-v1-4" rel="nofollow">stable-diffusion-v1-4</a> model with 50 steps. The <code>xFormers</code> benchmark is done using the <code>torch==1.13.1</code> version, while the accelerated transformers optimizations are tested using nightly versions of PyTorch 2.0. The tables below summarize the results we got.</p>
<p>The <code>Speed over xformers</code> columns denote the speed-up gained over <code>xFormers</code> using the <code>torch.compile+torch.nn.functional.scaled_dot_product_attention</code>.</p>
<h3 class="relative group"><a id="fp16-benchmark" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fp16-benchmark"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
<span>FP16 benchmark
</span></h3>
<p>The table below shows the benchmark results for inference using <code>fp16</code>. As we can see, <code>torch.nn.functional.scaled_dot_product_attention</code> is as fast as <code>xFormers</code> (sometimes slightly faster/slower) on all the GPUs we tested.
And using <code>torch.compile</code> gives further speed-up of up to 10% over <code>xFormers</code>, but it’s mostly noticeable on the A100 GPU.</p>
<p><strong><em>The time reported is in seconds.</em></strong></p>
<table><thead><tr><th scope="col">GPU</th>
<th scope="col">Batch Size</th>
<th scope="col">Vanilla Attention</th>
<th scope="col">xFormers</th>
<th scope="col">PyTorch2.0 SDPA</th>
<th scope="col">SDPA + torch.compile</th>
<th scope="col">Speed over xformers (%)</th></tr></thead>
<tbody><tr><td>A100</td>
<td>10</td>
<td>12.02</td>
<td>8.7</td>
<td>8.79</td>
<td>7.89</td>
<td>9.31</td></tr>
<tr><td>A100</td>
<td>16</td>
<td>18.95</td>
<td>13.57</td>
<td>13.67</td>
<td>12.25</td>
<td>9.73</td></tr>
<tr><td>A100</td>
<td>32 (1)</td>
<td>OOM</td>
<td>26.56</td>
<td>26.68</td>
<td>24.08</td>
<td>9.34</td></tr>
<tr><td>A100</td>
<td>64</td>
<td></td>
<td>52.51</td>
<td>53.03</td>
<td>47.81</td>
<td>8.95</td></tr>
<tr><td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td></tr>
<tr><td>A10</td>
<td>4</td>
<td>13.94</td>
<td>9.81</td>
<td>10.01</td>
<td>9.35</td>
<td>4.69</td></tr>
<tr><td>A10</td>
<td>8</td>
<td>27.09</td>
<td>19</td>
<td>19.53</td>
<td>18.33</td>
<td>3.53</td></tr>
<tr><td>A10</td>
<td>10</td>
<td>33.69</td>
<td>23.53</td>
<td>24.19</td>
<td>22.52</td>
<td>4.29</td></tr>
<tr><td>A10</td>
<td>16</td>
<td>OOM</td>
<td>37.55</td>
<td>38.31</td>
<td>36.81</td>
<td>1.97</td></tr>
<tr><td>A10</td>
<td>32 (1)</td>
<td></td>
<td>77.19</td>
<td>78.43</td>
<td>76.64</td>
<td>0.71</td></tr>
<tr><td>A10</td>
<td>64 (1)</td>
<td></td>
<td>173.59</td>
<td>158.99</td>
<td>155.14</td>
<td>10.63</td></tr>
<tr><td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td></tr>
<tr><td>T4</td>
<td>4</td>
<td>38.81</td>
<td>30.09</td>
<td>29.74</td>
<td>27.55</td>
<td>8.44</td></tr>
<tr><td>T4</td>
<td>8</td>
<td>OOM</td>
<td>55.71</td>
<td>55.99</td>
<td>53.85</td>
<td>3.34</td></tr>
<tr><td>T4</td>
<td>10</td>
<td>OOM</td>
<td>68.96</td>
<td>69.86</td>
<td>65.35</td>
<td>5.23</td></tr>
<tr><td>T4</td>
<td>16</td>
<td>OOM</td>
<td>111.47</td>
<td>113.26</td>
<td>106.93</td>
<td>4.07</td></tr>
<tr><td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td></tr>
<tr><td>V100</td>
<td>4</td>
<td>9.84</td>
<td>8.16</td>
<td>8.09</td>
<td>7.65</td>
<td>6.25</td></tr>
<tr><td>V100</td>
<td>8</td>
<td>OOM</td>
<td>15.62</td>
<td>15.44</td>
<td>14.59</td>
<td>6.59</td></tr>
<tr><td>V100</td>
<td>10</td>
<td>OOM</td>
<td>19.52</td>
<td>19.28</td>
<td>18.18</td>
<td>6.86</td></tr>
<tr><td>V100</td>
<td>16</td>
<td>OOM</td>
<td>30.29</td>
<td>29.84</td>
<td>28.22</td>
<td>6.83</td></tr>
<tr><td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td></tr>
<tr><td>3090</td>
<td>4</td>
<td>10.04</td>
<td>7.82</td>
<td>7.89</td>
<td>7.47</td>
<td>4.48</td></tr>
<tr><td>3090</td>
<td>8</td>
<td>19.27</td>
<td>14.97</td>
<td>15.04</td>
<td>14.22</td>
<td>5.01</td></tr>
<tr><td>3090</td>
<td>10</td>
<td>24.08</td>
<td>18.7</td>
<td>18.7</td>
<td>17.69</td>
<td>5.40</td></tr>
<tr><td>3090</td>
<td>16</td>
<td>OOM</td>
<td>29.06</td>
<td>29.06</td>
<td>28.2</td>
<td>2.96</td></tr>
<tr><td>3090</td>
<td>32 (1)</td>
<td></td>
<td>58.05</td>
<td>58</td>
<td>54.88</td>
<td>5.46</td></tr>
<tr><td>3090</td>
<td>64 (1)</td>
<td></td>
<td>126.54</td>
<td>126.03</td>
<td>117.33</td>
<td>7.28</td></tr>
<tr><td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td></tr>
<tr><td>3090 Ti</td>
<td>4</td>
<td>9.07</td>
<td>7.14</td>
<td>7.15</td>
<td>6.81</td>
<td>4.62</td></tr>
<tr><td>3090 Ti</td>
<td>8</td>
<td>17.51</td>
<td>13.65</td>
<td>13.72</td>
<td>12.99</td>
<td>4.84</td></tr>
<tr><td>3090 Ti</td>
<td>10 (2)</td>
<td>21.79</td>
<td>16.85</td>
<td>16.93</td>
<td>16.02</td>
<td>4.93</td></tr>
<tr><td>3090 Ti</td>
<td>16</td>
<td>OOM</td>
<td>26.1</td>
<td>26.28</td>
<td>25.46</td>
<td>2.45</td></tr>
<tr><td>3090 Ti</td>
<td>32 (1)</td>
<td></td>
<td>51.78</td>
<td>52.04</td>
<td>49.15</td>
<td>5.08</td></tr>
<tr><td>3090 Ti</td>
<td>64 (1)</td>
<td></td>
<td>112.02</td>
<td>112.33</td>
<td>103.91</td>
<td>7.24</td></tr>
<tr><td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td></tr>
<tr><td>4090</td>
<td>4</td>
<td>10.48</td>
<td>8.37</td>
<td>8.32</td>
<td>8.01</td>
<td>4.30</td></tr>
<tr><td>4090</td>
<td>8</td>
<td>14.33</td>
<td>10.22</td>
<td>10.42</td>
<td>9.78</td>
<td>4.31</td></tr>
<tr><td>4090</td>
<td>16</td>
<td></td>
<td>17.07</td>
<td>17.46</td>
<td>17.15</td>
<td>-0.47</td></tr>
<tr><td>4090</td>
<td>32 (1)</td>
<td></td>
<td>39.03</td>
<td>39.86</td>
<td>37.97</td>
<td>2.72</td></tr>
<tr><td>4090</td>
<td>64 (1)</td>
<td></td>
<td>77.29</td>
<td>79.44</td>
<td>77.67</td>
<td>-0.49</td></tr></tbody></table>
<h3 class="relative group"><a id="fp32-benchmark" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fp32-benchmark"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
<span>FP32 benchmark
</span></h3>
<p>The table below shows the benchmark results for inference using <code>fp32</code>. In this case, <code>torch.nn.functional.scaled_dot_product_attention</code> is faster than <code>xFormers</code> on all the GPUs we tested.</p>
<p>Using <code>torch.compile</code> in addition to the accelerated transformers implementation can yield up to 19% performance improvement over <code>xFormers</code> in Ampere and Ada cards, and up to 20% (Ampere) or 28% (Ada) over vanilla attention.</p>
<table><thead><tr><th scope="col">GPU</th>
<th scope="col">Batch Size</th>
<th scope="col">Vanilla Attention</th>
<th scope="col">xFormers</th>
<th scope="col">PyTorch2.0 SDPA</th>
<th scope="col">SDPA + torch.compile</th>
<th scope="col">Speed over xformers (%)</th>
<th scope="col">Speed over vanilla (%)</th></tr></thead>
<tbody><tr><td>A100</td>
<td>4</td>
<td>16.56</td>
<td>12.42</td>
<td>12.2</td>
<td>11.84</td>
<td>4.67</td>
<td>28.50</td></tr>
<tr><td>A100</td>
<td>10</td>
<td>OOM</td>
<td>29.93</td>
<td>29.44</td>
<td>28.5</td>
<td>4.78</td>
<td></td></tr>
<tr><td>A100</td>
<td>16</td>
<td></td>
<td>47.08</td>
<td>46.27</td>
<td>44.8</td>
<td>4.84</td>
<td></td></tr>
<tr><td>A100</td>
<td>32</td>
<td></td>
<td>92.89</td>
<td>91.34</td>
<td>88.35</td>
<td>4.89</td>
<td></td></tr>
<tr><td>A100</td>
<td>64</td>
<td></td>
<td>185.3</td>
<td>182.71</td>
<td>176.48</td>
<td>4.76</td>
<td></td></tr>
<tr><td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td></tr>
<tr><td>A10</td>
<td>1</td>
<td>10.59</td>
<td>8.81</td>
<td>7.51</td>
<td>7.35</td>
<td>16.57</td>
<td>30.59</td></tr>
<tr><td>A10</td>
<td>4</td>
<td>34.77</td>
<td>27.63</td>
<td>22.77</td>
<td>22.07</td>
<td>20.12</td>
<td>36.53</td></tr>
<tr><td>A10</td>
<td>8</td>
<td></td>
<td>56.19</td>
<td>43.53</td>
<td>43.86</td>
<td>21.94</td>
<td></td></tr>
<tr><td>A10</td>
<td>16</td>
<td></td>
<td>116.49</td>
<td>88.56</td>
<td>86.64</td>
<td>25.62</td>
<td></td></tr>
<tr><td>A10</td>
<td>32</td>
<td></td>
<td>221.95</td>
<td>175.74</td>
<td>168.18</td>
<td>24.23</td>
<td></td></tr>
<tr><td>A10</td>
<td>48</td>
<td></td>
<td>333.23</td>
<td>264.84</td>
<td></td>
<td>20.52</td>
<td></td></tr>
<tr><td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td></tr>
<tr><td>T4</td>
<td>1</td>
<td>28.2</td>
<td>24.49</td>
<td>23.93</td>
<td>23.56</td>
<td>3.80</td>
<td>16.45</td></tr>
<tr><td>T4</td>
<td>2</td>
<td>52.77</td>
<td>45.7</td>
<td>45.88</td>
<td>45.06</td>
<td>1.40</td>
<td>14.61</td></tr>
<tr><td>T4</td>
<td>4</td>
<td>OOM</td>
<td>85.72</td>
<td>85.78</td>
<td>84.48</td>
<td>1.45</td>
<td></td></tr>
<tr><td>T4</td>
<td>8</td>
<td></td>
<td>149.64</td>
<td>150.75</td>
<td>148.4</td>
<td>0.83</td>
<td></td></tr>
<tr><td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td></tr>
<tr><td>V100</td>
<td>1</td>
<td>7.4</td>
<td>6.84</td>
<td>6.8</td>
<td>6.66</td>
<td>2.63</td>
<td>10.00</td></tr>
<tr><td>V100</td>
<td>2</td>
<td>13.85</td>
<td>12.81</td>
<td>12.66</td>
<td>12.35</td>
<td>3.59</td>
<td>10.83</td></tr>
<tr><td>V100</td>
<td>4</td>
<td>OOM</td>
<td>25.73</td>
<td>25.31</td>
<td>24.78</td>
<td>3.69</td>
<td></td></tr>
<tr><td>V100</td>
<td>8</td>
<td></td>
<td>43.95</td>
<td>43.37</td>
<td>42.25</td>
<td>3.87</td>
<td></td></tr>
<tr><td>V100</td>
<td>16</td>
<td></td>
<td>84.99</td>
<td>84.73</td>
<td>82.55</td>
<td>2.87</td>
<td></td></tr>
<tr><td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td></tr>
<tr><td>3090</td>
<td>1</td>
<td>7.09</td>
<td>6.78</td>
<td>6.11</td>
<td>6.03</td>
<td>11.06</td>
<td>14.95</td></tr>
<tr><td>3090</td>
<td>4</td>
<td>22.69</td>
<td>21.45</td>
<td>18.67</td>
<td>18.09</td>
<td>15.66</td>
<td>20.27</td></tr>
<tr><td>3090</td>
<td>8</td>
<td></td>
<td>42.59</td>
<td>36.75</td>
<td>35.59</td>
<td>16.44</td>
<td></td></tr>
<tr><td>3090</td>
<td>16</td>
<td></td>
<td>85.35</td>
<td>72.37</td>
<td>70.25</td>
<td>17.69</td>
<td></td></tr>
<tr><td>3090</td>
<td>32 (1)</td>
<td></td>
<td>162.05</td>
<td>138.99</td>
<td>134.53</td>
<td>16.98</td>
<td></td></tr>
<tr><td>3090</td>
<td>48</td>
<td></td>
<td>241.91</td>
<td>207.75</td>
<td></td>
<td>14.12</td>
<td></td></tr>
<tr><td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td></tr>
<tr><td>3090 Ti</td>
<td>1</td>
<td>6.45</td>
<td>6.19</td>
<td>5.64</td>
<td>5.49</td>
<td>11.31</td>
<td>14.88</td></tr>
<tr><td>3090 Ti</td>
<td>4</td>
<td>20.32</td>
<td>19.31</td>
<td>16.9</td>
<td>16.37</td>
<td>15.23</td>
<td>19.44</td></tr>
<tr><td>3090 Ti</td>
<td>8 (2)</td>
<td></td>
<td>37.93</td>
<td>33.05</td>
<td>31.99</td>
<td>15.66</td>
<td></td></tr>
<tr><td>3090 Ti</td>
<td>16</td>
<td></td>
<td>75.37</td>
<td>65.25</td>
<td>64.32</td>
<td>14.66</td>
<td></td></tr>
<tr><td>3090 Ti</td>
<td>32 (1)</td>
<td></td>
<td>142.55</td>
<td>124.44</td>
<td>120.74</td>
<td>15.30</td>
<td></td></tr>
<tr><td>3090 Ti</td>
<td>48</td>
<td></td>
<td>213.19</td>
<td>186.55</td>
<td></td>
<td>12.50</td>
<td></td></tr>
<tr><td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td></tr>
<tr><td>4090</td>
<td>1</td>
<td>5.54</td>
<td>4.99</td>
<td>4.51</td>
<td>4.44</td>
<td>11.02</td>
<td>19.86</td></tr>
<tr><td>4090</td>
<td>4</td>
<td>13.67</td>
<td>11.4</td>
<td>10.3</td>
<td>9.84</td>
<td>13.68</td>
<td>28.02</td></tr>
<tr><td>4090</td>
<td>8</td>
<td></td>
<td>19.79</td>
<td>17.13</td>
<td>16.19</td>
<td>18.19</td>
<td></td></tr>
<tr><td>4090</td>
<td>16</td>
<td></td>
<td>38.62</td>
<td>33.14</td>
<td>32.31</td>
<td>16.34</td>
<td></td></tr>
<tr><td>4090</td>
<td>32 (1)</td>
<td></td>
<td>76.57</td>
<td>65.96</td>
<td>62.05</td>
<td>18.96</td>
<td></td></tr>
<tr><td>4090</td>
<td>48</td>
<td></td>
<td>114.44</td>
<td>98.78</td>
<td></td>
<td>13.68</td>
<td></td></tr></tbody></table>
<p>(1) Batch Size &gt;= 32 requires enable_vae_slicing() because of <a href="https://github.com/pytorch/pytorch/issues/81665" rel="nofollow">https://github.com/pytorch/pytorch/issues/81665</a>
This is required for PyTorch 1.13.1, and also for PyTorch 2.0 and batch size of 64</p>
<p>For more details about how this benchmark was run, please refer to <a href="https://github.com/huggingface/diffusers/pull/2303" rel="nofollow">this PR</a>. </p>
<script type="module" data-hydrate="5md4mh">
// SvelteKit client bootstrap: hydrates this server-rendered docs page in place.
import { start } from "/docs/diffusers/v0.14.0/en/_app/start-hf-doc-builder.js";
start({
// Mount point: the parent element of this script tag, found via its data-hydrate id.
target: document.querySelector('[data-hydrate="5md4mh"]').parentNode,
// URL prefixes for this versioned, English docs build (v0.14.0/en).
paths: {"base":"/docs/diffusers/v0.14.0/en","assets":"/docs/diffusers/v0.14.0/en"},
session: {},
// route/spa are disabled — presumably this page is served as a static document
// rather than a client-routed app (generated config; confirm against doc-builder).
route: false,
spa: false,
trailing_slash: "never",
hydrate: {
status: 200,
error: null,
// Modules to hydrate: the shared layout plus this page's compiled MDX module.
nodes: [
import("/docs/diffusers/v0.14.0/en/_app/pages/__layout.svelte-hf-doc-builder.js"),
import("/docs/diffusers/v0.14.0/en/_app/pages/optimization/torch2.0.mdx-hf-doc-builder.js")
],
params: {}
}
});
</script>

Xet Storage Details

Size:
30.8 kB
·
Xet hash:
cb6103c407ddea192998b42efe1b0f2661ab631f0dd9bdc8cc9f8e49b6cd5a72

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.