Buckets:

rtrm's picture
download
raw
44 kB
import{s as Rl,o as kl,n as x}from"../chunks/scheduler.5c93273d.js";import{S as Il,i as Vl,g as f,s as a,r as h,A as Ll,h as r,f as l,c as p,j as Cl,u as d,x as u,k as Xl,y as Hl,a as s,v as M,d as b,t as $,w}from"../chunks/index.e43dd92b.js";import{T as Ve}from"../chunks/Tip.1cbfe904.js";import{C as G}from"../chunks/CodeBlock.6896320e.js";import{H as W,E as El}from"../chunks/getInferenceSnippets.22672bbf.js";import{H as Nl,a as At}from"../chunks/HfOption.d50154c3.js";function Ql(j){let n,T="bfloat16与float16类似,但对数值误差更稳健。硬件对bfloat16的支持各不相同,但大多数现代GPU都能支持bfloat16。",i,m,y;return m=new G({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXBpcGVsaW5lJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyc3RhYmlsaXR5YWklMkZzdGFibGUtZGlmZnVzaW9uLXhsLWJhc2UtMS4wJTIyJTJDJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5iZmxvYXQxNiUwQSkudG8oJTIyY3VkYSUyMiklMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJBc3Ryb25hdXQlMjBpbiUyMGElMjBqdW5nbGUlMkMlMjBjb2xkJTIwY29sb3IlMjBwYWxldHRlJTJDJTIwbXV0ZWQlMjBjb2xvcnMlMkMlMjBkZXRhaWxlZCUyQyUyMDhrJTIyJTBBcGlwZWxpbmUocHJvbXB0JTJDJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDMwKS5pbWFnZXMlNUIwJTVE",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),{c(){n=f("p"),n.textContent=T,i=a(),h(m.$$.fragment)},l(c){n=r(c,"P",{"data-svelte-h":!0}),u(n)!=="svelte-1u2chbj"&&(n.textContent=T),i=p(c),d(m.$$.fragment,c)},m(c,o){s(c,n,o),s(c,i,o),M(m,c,o),y=!0},p:x,i(c){y||(b(m.$$.fragment,c),y=!0)},o(c){$(m.$$.fragment,c),y=!1},d(c){c&&(l(n),l(i)),w(m,c)}}}function Fl(j){let n,T="float16与bfloat16类似,但可能更容易出现数值误差。",i,m,y;return m=new G({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXBpcGVsaW5lJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyc3RhYmlsaXR5YWklMkZzdGFibGUtZGlmZnVzaW9uLXhsLWJhc2UtMS4wJTIyJTJDJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5mbG9hdDE2JTBBKS50byglMjJjdWRhJTIyKSUwQSUwQXByb21wdCUyMCUzRCUyMCUyMkFzdHJvbmF1dCUyMGluJTIwYSUyMGp1bmdsZSUyQyUyMGNvbGQlMjBjb2xvciUyMHBhbGV0dGUlMkMlMjBtdXRlZCUyMGNvbG9ycyUyQyUyMGRldGFpbGVkJTJDJTIwOGslMjIlMEFwaXBlbGluZShwcm9tcHQlMkMlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNEMzApLmltYWdlcyU1QjAlNUQ=",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.float16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),{c(){n=f("p"),n.textContent=T,i=a(),h(m.$$.fragment)},l(c){n=r(c,"P",{"data-svelte-h":!0}),u(n)!=="svelte-1nw1ct0"&&(n.textContent=T),i=p(c),d(m.$$.fragment,c)},m(c,o){s(c,n,o),s(c,i,o),M(m,c,o),y=!0},p:x,i(c){y||(b(m.$$.fragment,c),y=!0)},o(c){$(m.$$.fragment,c),y=!1},d(c){c&&(l(n),l(i)),w(m,c)}}}function Yl(j){let n,T='<a href="https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/" rel="nofollow">TensorFloat-32 (tf32)</a>模式在NVIDIA Ampere GPU上受支持,它以tf32计算卷积和矩阵乘法运算。存储和其他操作保持在float32。与bfloat16或float16结合使用时,可以显著加快计算速度。',i,m,y="PyTorch默认仅对卷积启用tf32模式,您需要显式启用矩阵乘法的tf32模式。",c,o,U,Z,_='更多详情请参阅<a href="https://huggingface.co/docs/transformers/en/perf_train_gpu_one#mixed-precision" rel="nofollow">混合精度训练</a>文档。',B;return o=new G({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXRvcmNoLmJhY2tlbmRzLmN1ZGEubWF0bXVsLmFsbG93X3RmMzIlMjAlM0QlMjBUcnVlJTBBJTBBcGlwZWxpbmUlMjAlM0QlMjBTdGFibGVEaWZmdXNpb25YTFBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJzdGFiaWxpdHlhaSUyRnN0YWJsZS1kaWZmdXNpb24teGwtYmFzZS0xLjAlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmJmbG9hdDE2JTBBKS50byglMjJjdWRhJTIyKSUwQSUwQXByb21wdCUyMCUzRCUyMCUyMkFzdHJvbmF1dCUyMGluJTIwYSUyMGp1bmdsZSUyQyUyMGNvbGQlMjBjb2xvciUyMHBhbGV0dGUlMkMlMjBtdXRlZCUyMGNvbG9ycyUyQyUyMGRldGFpbGVkJTJDJTIwOGslMjIlMEFwaXBlbGluZShwcm9tcHQlMkMlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNEMzApLmltYWdlcyU1QjAlNUQ=",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
torch.backends.cuda.matmul.allow_tf32 = <span class="hljs-literal">True</span>
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),{c(){n=f("p"),n.innerHTML=T,i=a(),m=f("p"),m.textContent=y,c=a(),h(o.$$.fragment),U=a(),Z=f("p"),Z.innerHTML=_},l(J){n=r(J,"P",{"data-svelte-h":!0}),u(n)!=="svelte-o6hhje"&&(n.innerHTML=T),i=p(J),m=r(J,"P",{"data-svelte-h":!0}),u(m)!=="svelte-1p6w426"&&(m.textContent=y),c=p(J),d(o.$$.fragment,J),U=p(J),Z=r(J,"P",{"data-svelte-h":!0}),u(Z)!=="svelte-1v2g6n5"&&(Z.innerHTML=_)},m(J,g){s(J,n,g),s(J,i,g),s(J,m,g),s(J,c,g),M(o,J,g),s(J,U,g),s(J,Z,g),B=!0},p:x,i(J){B||(b(o.$$.fragment,J),B=!0)},o(J){$(o.$$.fragment,J),B=!1},d(J){J&&(l(n),l(i),l(m),l(c),l(U),l(Z)),w(o,J)}}}function Sl(j){let n,T,i,m,y,c;return n=new At({props:{id:"dtypes",option:"bfloat16",$$slots:{default:[Ql]},$$scope:{ctx:j}}}),i=new At({props:{id:"dtypes",option:"float16",$$slots:{default:[Fl]},$$scope:{ctx:j}}}),y=new At({props:{id:"dtypes",option:"TensorFloat-32",$$slots:{default:[Yl]},$$scope:{ctx:j}}}),{c(){h(n.$$.fragment),T=a(),h(i.$$.fragment),m=a(),h(y.$$.fragment)},l(o){d(n.$$.fragment,o),T=p(o),d(i.$$.fragment,o),m=p(o),d(y.$$.fragment,o)},m(o,U){M(n,o,U),s(o,T,U),M(i,o,U),s(o,m,U),M(y,o,U),c=!0},p(o,U){const Z={};U&2&&(Z.$$scope={dirty:U,ctx:o}),n.$set(Z);const _={};U&2&&(_.$$scope={dirty:U,ctx:o}),i.$set(_);const B={};U&2&&(B.$$scope={dirty:U,ctx:o}),y.$set(B)},i(o){c||(b(n.$$.fragment,o),b(i.$$.fragment,o),b(y.$$.fragment,o),c=!0)},o(o){$(n.$$.fragment,o),$(i.$$.fragment,o),$(y.$$.fragment,o),c=!1},d(o){o&&(l(T),l(m)),w(n,o),w(i,o),w(y,o)}}}function zl(j){let n,T='内存高效注意力优化了推理速度<em>和</em><a href="./memory#memory-efficient-attention">内存使用</a>!';return{c(){n=f("p"),n.innerHTML=T},l(i){n=r(i,"P",{"data-svelte-h":!0}),u(n)!=="svelte-yv527y"&&(n.innerHTML=T)},m(i,m){s(i,n,m)},p:x,d(i){i&&l(n)}}}function Al(j){let n,T='在PyTorch 2.3.1中,您可以控制torch.compile的缓存行为。这对于像<code>&quot;max-autotune&quot;</code>这样的编译模式特别有用,它会通过网格搜索多个编译标志来找到最优配置。更多详情请参阅<a href="https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html" rel="nofollow">torch.compile中的编译时间缓存</a>教程。';return{c(){n=f("p"),n.innerHTML=T},l(i){n=r(i,"P",{"data-svelte-h":!0}),u(n)!=="svelte-hnyc8w"&&(n.innerHTML=T)},m(i,m){s(i,n,m)},p:x,d(i){i&&l(n)}}}function Pl(j){let n,T="确保始终使用PyTorch的nightly版本以获得更好的支持。";return{c(){n=f("p"),n.textContent=T},l(i){n=r(i,"P",{"data-svelte-h":!0}),u(n)!=="svelte-1eyhk6t"&&(n.textContent=T)},m(i,m){s(i,n,m)},p:x,d(i){i&&l(n)}}}function ql(j){let n,T='更多区域编译示例,请参阅参考<a href="https://github.com/huggingface/diffusers/pull/11705" rel="nofollow">PR</a>。';return{c(){n=f("p"),n.innerHTML=T},l(i){n=r(i,"P",{"data-svelte-h":!0}),u(n)!=="svelte-1a0e2"&&(n.innerHTML=T)},m(i,m){s(i,n,m)},p:x,d(i){i&&l(n)}}}function Dl(j){let n,T='参阅<a href="https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/" rel="nofollow">torch.compile和Diffusers:峰值性能实践指南</a>博客文章,了解如何为扩散模型最大化<code>torch.compile</code>的性能。';return{c(){n=f("p"),n.innerHTML=T},l(i){n=r(i,"P",{"data-svelte-h":!0}),u(n)!=="svelte-fxt4zr"&&(n.innerHTML=T)},m(i,m){s(i,n,m)},p:x,d(i){i&&l(n)}}}function Kl(j){let n,T='参阅我们的<a href="../quantization/torchao">torchao</a>文档,了解更多关于如何使用Diffusers torchao集成的信息。';return{c(){n=f("p"),n.innerHTML=T},l(i){n=r(i,"P",{"data-svelte-h":!0}),u(n)!=="svelte-5evyml"&&(n.innerHTML=T)},m(i,m){s(i,n,m)},p:x,d(i){i&&l(n)}}}function Ol(j){let n,T,i,m,y,c,o,U='Diffusion模型在推理时速度较慢,因为生成是一个迭代过程,需要经过一定数量的”步数”逐步将噪声细化为图像或视频。要加速这一过程,您可以尝试使用不同的<a href="../api/schedulers/overview">调度器</a>、降低模型权重的精度以加快计算、使用更高效的内存注意力机制等方法。',Z,_,B="将这些技术组合使用,可以比单独使用任何一种技术获得更快的推理速度。",J,g,Pt="本指南将介绍如何加速推理。",He,L,Ee,H,qt="模型权重的精度和数据类型会影响推理速度,因为更高的精度需要更多内存来加载,也需要更多时间进行计算。PyTorch默认以float32或全精度加载模型权重,因此更改数据类型是快速获得更快推理速度的简单方法。",Ne,v,Qe,E,Fe,C,Ye,N,Dt='<a href="https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html" rel="nofollow">缩放点积注意力(SDPA)</a>实现了多种注意力后端,包括<a href="https://github.com/Dao-AILab/flash-attention" rel="nofollow">FlashAttention</a>、<a href="https://github.com/facebookresearch/xformers" rel="nofollow">xFormers</a>和原生C++实现。它会根据您的硬件自动选择最优的后端。',Se,Q,Kt='如果您使用的是PyTorch &gt;= 2.0,SDPA默认启用,无需对代码进行任何额外更改。不过,您也可以尝试使用其他注意力后端来自行选择。下面的示例使用<a href="https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html" rel="nofollow">torch.nn.attention.sdpa_kernel</a>上下文管理器来启用高效注意力。',ze,F,Ae,Y,Pe,S,Ot='<a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html" rel="nofollow">torch.compile</a>通过将PyTorch代码和操作编译为优化的内核来加速推理。Diffusers通常会编译计算密集型的模型,如UNet、transformer或VAE。',qe,z,el='启用以下编译器设置以获得最大速度(更多选项请参阅<a href="https://github.com/pytorch/pytorch/blob/main/torch/_inductor/config.py" rel="nofollow">完整列表</a>)。',De,A,Ke,P,tl="加载并编译UNet和VAE。有几种不同的模式可供选择,但<code>&quot;max-autotune&quot;</code>通过编译为CUDA图来优化速度。CUDA图通过单个CPU操作启动多个GPU操作,有效减少了开销。",Oe,X,et,q,ll='将内存布局更改为<a href="./memory#torchchannels_last">channels_last</a>也可以优化内存和推理速度。',tt,D,lt,K,sl="第一次编译时速度较慢,但一旦编译完成,速度会显著提升。尽量只在相同类型的推理操作上使用编译后的管道。在不同尺寸的图像上调用编译后的管道会重新触发编译,这会很慢且效率低下。",st,O,nt,R,at,ee,nl="<code>torch.compile</code>会跟踪输入形状和条件,如果这些不同,它会重新编译模型。例如,如果模型是在1024x1024分辨率的图像上编译的,而在不同分辨率的图像上使用,就会触发重新编译。",pt,te,al="为避免重新编译,添加<code>dynamic=True</code>以尝试生成更动态的内核,避免条件变化时重新编译。",it,le,ot,se,pl='指定<code>use_duck_shape=False</code>会指示编译器是否应使用相同的符号变量来表示相同大小的输入。更多详情请参阅此<a href="https://github.com/huggingface/diffusers/pull/11327#discussion_r2047659790" rel="nofollow">评论</a>。',ft,ne,il='并非所有模型都能开箱即用地从动态编译中受益,可能需要更改。参考此<a href="https://github.com/huggingface/diffusers/pull/11297/" rel="nofollow">PR</a>,它改进了<code>AuraFlowPipeline</code>的实现以受益于动态编译。',rt,ae,ol="如果动态编译对Diffusers模型的效果不如预期,请随时提出问题。",mt,pe,ct,ie,fl='<a href="https://docs.pytorch.org/tutorials/recipes/regional_compilation.html" rel="nofollow">区域编译</a>通过仅编译模型中<em>小而频繁重复的块</em>(通常是transformer层)来减少冷启动延迟,并为每个后续出现的块重用编译后的工件。对于许多diffusion架构,这提供了与全图编译相同的运行时加速,并将编译时间减少了8-10倍。',ut,oe,rl="使用<code>compile_repeated_blocks()</code>方法(一个包装<code>torch.compile</code>的辅助函数)在任何组件(如transformer模型)上,如下所示。",ht,fe,dt,re,ml="要为新模型启用区域编译,请在模型类中添加一个<code>_repeated_blocks</code>属性,包含您想要编译的块的类名(作为字符串)。",Mt,me,bt,k,$t,ce,cl='<a href="https://huggingface.co/docs/accelerate/index" rel="nofollow">Accelerate</a>中还有一个<a href="https://github.com/huggingface/accelerate/blob/273799c85d849a1954a4f2e65767216eb37fa089/src/accelerate/utils/other.py#L78" rel="nofollow">compile_regions</a>方法,可以自动选择模型中的候选块进行编译。其余图会单独编译。这对于快速实验很有用,因为您不需要设置哪些块要编译或调整编译标志。',wt,ue,Tt,he,ul="<code>compile_repeated_blocks()</code>是故意显式的。在<code>_repeated_blocks</code>中列出要重复的块,辅助函数仅编译这些块。它提供了可预测的行为,并且只需一行代码即可轻松推理缓存重用。",yt,de,Jt,Me,hl="在torch.compile中指定<code>fullgraph=True</code>非常重要,以确保底层模型中没有图中断。这使您可以充分利用torch.compile而不会降低性能。对于UNet和VAE,这会改变您访问返回变量的方式。",jt,be,Ut,$e,Zt,we,dl='每次去噪器做出预测后,调度器的<code>step()</code>函数会被<a href="https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L1228" rel="nofollow">调用</a>,并且<code>sigmas</code>变量会被<a href="https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/schedulers/scheduling_euler_discrete.py#L476" rel="nofollow">索引</a>。当放在GPU上时,这会引入延迟,因为CPU和GPU之间需要进行通信同步。当去噪器已经编译时,这一点会更加明显。',gt,Te,Ml='一般来说,<code>sigmas</code>应该<a href="https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/schedulers/scheduling_euler_discrete.py#L240" rel="nofollow">保持在CPU上</a>,以避免通信同步和延迟。',Gt,I,_t,ye,Wt,Je,bl='参阅<a href="https://huggingface.co/datasets/diffusers/benchmarks" rel="nofollow">diffusers/benchmarks</a>数据集,查看编译管道的推理延迟和内存使用数据。',Bt,je,$l='<a href="https://github.com/sayakpaul/diffusers-torchao#benchmarking-results" rel="nofollow">diffusers-torchao</a>仓库还包含Flux和CogVideoX编译版本的基准测试结果。',xt,Ue,vt,Ze,wl='<a href="https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html" rel="nofollow">动态量化</a>通过降低精度以加快数学运算来提高推理速度。这种特定类型的量化在运行时根据数据确定如何缩放激活,而不是使用固定的缩放因子。因此,缩放因子与数据更准确地匹配。',Ct,ge,Tl='以下示例使用<a href="../quantization/torchao">torchao</a>库对UNet和VAE应用<a href="https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html" rel="nofollow">动态int8量化</a>。',Xt,V,Rt,Ge,yl="配置编译器标志以获得最大速度。",kt,_e,It,We,Jl='使用<a href="https://github.com/huggingface/diffusion-fast/blob/0f169640b1db106fe6a479f78c1ed3bfaeba3386/utils/pipeline_utils.py#L16" rel="nofollow">dynamic_quant_filter_fn</a>过滤掉UNet和VAE中一些不会从动态量化中受益的线性层。',Vt,Be,Lt,xe,Ht,ve,jl='<p>[!WARNING][fuse_qkv_projections](<a href="https://github.com/huggingface/diffusers/blob/58431f102cf39c3c8a569f32d71b2ea8caa461e1/src/diffusers/pipelines/pipeline_utils.py#L2034)%E6%96%B9%E6%B3%95%E6%98%AF%E5%AE%9E%E9%AA%8C%E6%80%A7%E7%9A%84%EF%BC%8C%E7%9B%AE%E5%89%8D%E4%B8%BB%E8%A6%81%E6%94%AF%E6%8C%81Stable" rel="nofollow">https://github.com/huggingface/diffusers/blob/58431f102cf39c3c8a569f32d71b2ea8caa461e1/src/diffusers/pipelines/pipeline_utils.py#L2034)方法是实验性的,目前主要支持Stable</a> Diffusion管道。参阅此<a href="https://github.com/huggingface/diffusers/pull/6179" rel="nofollow">PR</a>了解如何为其他管道启用它。</p>',Et,Ce,Ul="在注意力块中,输入被投影到三个子空间,分别由投影矩阵Q、K和V表示。这些投影通常单独计算,但您可以水平组合这些矩阵为一个矩阵,并在单步中执行投影。这会增加输入投影的矩阵乘法大小,并提高量化的效果。",Nt,Xe,Qt,Re,Ft,ke,Zl='<li><p>阅读<a href="https://pytorch.org/blog/presenting-flux-fast-making-flux-go-brrr-on-h100s/" rel="nofollow">Presenting Flux Fast: Making Flux go brrr on H100s</a>博客文章,了解如何结合所有这些优化与<a href="https://docs.pytorch.org/docs/stable/torch.compiler.html" rel="nofollow">TorchInductor</a>和<a href="https://docs.pytorch.org/docs/stable/torch.compiler_aot_inductor.html" rel="nofollow">AOTInductor</a>,使用<a href="https://github.com/huggingface/flux-fast" rel="nofollow">flux-fast</a>的配方获得约2.5倍的加速。</p> <p>这些配方支持AMD硬件和<a href="https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev" rel="nofollow">Flux.1 Kontext Dev</a>。</p></li> <li><p>阅读<a href="https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/" rel="nofollow">torch.compile和Diffusers:峰值性能实践指南</a>博客文章,了解如何在使用<code>torch.compile</code>时最大化性能。</p></li>',Yt,Ie,St,Le,zt;return y=new W({props:{title:"加速推理",local:"加速推理",headingTag:"h1"}}),L=new W({props:{title:"模型数据类型",local:"模型数据类型",headingTag:"h2"}}),v=new Nl({props:{id:"dtypes",options:["bfloat16","float16","TensorFloat-32"],$$slots:{default:[Sl]},$$scope:{ctx:j}}}),E=new W({props:{title:"缩放点积注意力",local:"缩放点积注意力",headingTag:"h2"}}),C=new Ve({props:{warning:!1,$$slots:{default:[zl]},$$scope:{ctx:j}}}),F=new G({props:{code:"ZnJvbSUyMHRvcmNoLm5uLmF0dGVudGlvbiUyMGltcG9ydCUyMFNEUEJhY2tlbmQlMkMlMjBzZHBhX2tlcm5lbCUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMFN0YWJsZURpZmZ1c2lvblhMUGlwZWxpbmUlMEElMEFwaXBlbGluZSUyMCUzRCUyMFN0YWJsZURpZmZ1c2lvblhMUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMnN0YWJpbGl0eWFpJTJGc3RhYmxlLWRpZmZ1c2lvbi14bC1iYXNlLTEuMCUyMiUyQyUyMHRvcmNoX2R0eXBlJTNEdG9yY2guYmZsb2F0MTYlMEEpLnRvKCUyMmN1ZGElMjIpJTBBJTBBcHJvbXB0JTIwJTNEJTIwJTIyQXN0cm9uYXV0JTIwaW4lMjBhJTIwanVuZ2xlJTJDJTIwY29sZCUyMGNvbG9yJTIwcGFsZXR0ZSUyQyUyMG11dGVkJTIwY29sb3JzJTJDJTIwZGV0YWlsZWQlMkMlMjA4ayUyMiUwQSUwQXdpdGglMjBzZHBhX2tlcm5lbChTRFBCYWNrZW5kLkVGRklDSUVOVF9BVFRFTlRJT04pJTNBJTBBJTIwJTIwaW1hZ2UlMjAlM0QlMjBwaXBlbGluZShwcm9tcHQlMkMlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNEMzApLmltYWdlcyU1QjAlNUQ=",highlighted:`<span class="hljs-keyword">from</span> torch.nn.attention <span class="hljs-keyword">import</span> SDPBackend, sdpa_kernel
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
<span class="hljs-keyword">with</span> sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
image = pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),Y=new W({props:{title:"torch.compile",local:"torchcompile",headingTag:"h2"}}),A=new G({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcuY29udl8xeDFfYXNfbW0lMjAlM0QlMjBUcnVlJTBBdG9yY2guX2luZHVjdG9yLmNvbmZpZy5jb29yZGluYXRlX2Rlc2NlbnRfdHVuaW5nJTIwJTNEJTIwVHJ1ZSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcuZXBpbG9ndWVfZnVzaW9uJTIwJTNEJTIwRmFsc2UlMEF0b3JjaC5faW5kdWN0b3IuY29uZmlnLmNvb3JkaW5hdGVfZGVzY2VudF9jaGVja19hbGxfZGlyZWN0aW9ucyUyMCUzRCUyMFRydWU=",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
torch._inductor.config.conv_1x1_as_mm = <span class="hljs-literal">True</span>
torch._inductor.config.coordinate_descent_tuning = <span class="hljs-literal">True</span>
torch._inductor.config.epilogue_fusion = <span class="hljs-literal">False</span>
torch._inductor.config.coordinate_descent_check_all_directions = <span class="hljs-literal">True</span>`,wrap:!1}}),X=new Ve({props:{warning:!1,$$slots:{default:[Al]},$$scope:{ctx:j}}}),D=new G({props:{code:"cGlwZWxpbmUlMjAlM0QlMjBTdGFibGVEaWZmdXNpb25YTFBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJzdGFiaWxpdHlhaSUyRnN0YWJsZS1kaWZmdXNpb24teGwtYmFzZS0xLjAlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmZsb2F0MTYlMEEpLnRvKCUyMmN1ZGElMjIpJTBBcGlwZWxpbmUudW5ldC50byhtZW1vcnlfZm9ybWF0JTNEdG9yY2guY2hhbm5lbHNfbGFzdCklMEFwaXBlbGluZS52YWUudG8obWVtb3J5X2Zvcm1hdCUzRHRvcmNoLmNoYW5uZWxzX2xhc3QpJTBBcGlwZWxpbmUudW5ldCUyMCUzRCUyMHRvcmNoLmNvbXBpbGUoJTBBJTIwJTIwJTIwJTIwcGlwZWxpbmUudW5ldCUyQyUyMG1vZGUlM0QlMjJtYXgtYXV0b3R1bmUlMjIlMkMlMjBmdWxsZ3JhcGglM0RUcnVlJTBBKSUwQXBpcGVsaW5lLnZhZS5kZWNvZGUlMjAlM0QlMjB0b3JjaC5jb21waWxlKCUwQSUyMCUyMCUyMCUyMHBpcGVsaW5lLnZhZS5kZWNvZGUlMkMlMEElMjAlMjAlMjAlMjBtb2RlJTNEJTIybWF4LWF1dG90dW5lJTIyJTJDJTBBJTIwJTIwJTIwJTIwZnVsbGdyYXBoJTNEVHJ1ZSUwQSklMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJBc3Ryb25hdXQlMjBpbiUyMGElMjBqdW5nbGUlMkMlMjBjb2xkJTIwY29sb3IlMjBwYWxldHRlJTJDJTIwbXV0ZWQlMjBjb2xvcnMlMkMlMjBkZXRhaWxlZCUyQyUyMDhrJTIyJTBBcGlwZWxpbmUocHJvbXB0JTJDJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDMwKS5pbWFnZXMlNUIwJTVE",highlighted:`pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.float16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
pipeline.unet.to(memory_format=torch.channels_last)
pipeline.vae.to(memory_format=torch.channels_last)
pipeline.unet = torch.<span class="hljs-built_in">compile</span>(
pipeline.unet, mode=<span class="hljs-string">&quot;max-autotune&quot;</span>, fullgraph=<span class="hljs-literal">True</span>
)
pipeline.vae.decode = torch.<span class="hljs-built_in">compile</span>(
pipeline.vae.decode,
mode=<span class="hljs-string">&quot;max-autotune&quot;</span>,
fullgraph=<span class="hljs-literal">True</span>
)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),O=new W({props:{title:"动态形状编译",local:"动态形状编译",headingTag:"h3"}}),R=new Ve({props:{warning:!1,$$slots:{default:[Pl]},$$scope:{ctx:j}}}),le=new G({props:{code:"JTJCJTIwdG9yY2guZnguZXhwZXJpbWVudGFsLl9jb25maWcudXNlX2R1Y2tfc2hhcGUlMjAlM0QlMjBGYWxzZSUwQSUyQiUyMHBpcGVsaW5lLnVuZXQlMjAlM0QlMjB0b3JjaC5jb21waWxlKCUwQSUyMCUyMCUyMCUyMHBpcGVsaW5lLnVuZXQlMkMlMjBmdWxsZ3JhcGglM0RUcnVlJTJDJTIwZHluYW1pYyUzRFRydWUlMEEp",highlighted:`<span class="hljs-addition">+ torch.fx.experimental._config.use_duck_shape = False</span>
<span class="hljs-addition">+ pipeline.unet = torch.compile(</span>
pipeline.unet, fullgraph=True, dynamic=True
)`,wrap:!1}}),pe=new W({props:{title:"区域编译",local:"区域编译",headingTag:"h3"}}),fe=new G({props:{code:"JTIzJTIwcGlwJTIwaW5zdGFsbCUyMC1VJTIwZGlmZnVzZXJzJTBBaW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXBpcGVsaW5lJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyc3RhYmlsaXR5YWklMkZzdGFibGUtZGlmZnVzaW9uLXhsLWJhc2UtMS4wJTIyJTJDJTBBJTIwJTIwJTIwJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5mbG9hdDE2JTJDJTBBKS50byglMjJjdWRhJTIyKSUwQSUwQSUyMyUyMCVFNCVCQiU4NSVFNyVCQyU5NiVFOCVBRiU5MVVOZXQlRTQlQjglQUQlRTklODclOEQlRTUlQTQlOEQlRTclOUElODR0cmFuc2Zvcm1lciVFNSVCMSU4MiUwQXBpcGVsaW5lLnVuZXQuY29tcGlsZV9yZXBlYXRlZF9ibG9ja3MoZnVsbGdyYXBoJTNEVHJ1ZSk=",highlighted:`<span class="hljs-comment"># pip install -U diffusers</span>
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>,
torch_dtype=torch.float16,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-comment"># 仅编译UNet中重复的transformer层</span>
pipeline.unet.compile_repeated_blocks(fullgraph=<span class="hljs-literal">True</span>)`,wrap:!1}}),me=new G({props:{code:"Y2xhc3MlMjBNeVVOZXQoTW9kZWxNaXhpbiklM0ElMEElMjAlMjAlMjAlMjBfcmVwZWF0ZWRfYmxvY2tzJTIwJTNEJTIwKCUyMlRyYW5zZm9ybWVyMkRNb2RlbCUyMiUyQyklMjAlMjAlMjMlMjAlRTIlODYlOTAlMjAlRTklQkIlOTglRTglQUUlQTQlRTclQkMlOTYlRTglQUYlOTE=",highlighted:`<span class="hljs-keyword">class</span> <span class="hljs-title class_">MyUNet</span>(<span class="hljs-title class_ inherited__">ModelMixin</span>):
_repeated_blocks = (<span class="hljs-string">&quot;Transformer2DModel&quot;</span>,) <span class="hljs-comment"># ← 默认编译</span>`,wrap:!1}}),k=new Ve({props:{warning:!1,$$slots:{default:[ql]},$$scope:{ctx:j}}}),ue=new G({props:{code:"JTIzJTIwcGlwJTIwaW5zdGFsbCUyMC1VJTIwYWNjZWxlcmF0ZSUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMFN0YWJsZURpZmZ1c2lvblhMUGlwZWxpbmUlMEFmcm9tJTIwYWNjZWxlcmF0ZS51dGlscyUyMGltcG9ydCUyMGNvbXBpbGUlMjByZWdpb25zJTBBJTBBcGlwZWxpbmUlMjAlM0QlMjBTdGFibGVEaWZmdXNpb25YTFBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJzdGFiaWxpdHlhaSUyRnN0YWJsZS1kaWZmdXNpb24teGwtYmFzZS0xLjAlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmZsb2F0MTYlMEEpLnRvKCUyMmN1ZGElMjIpJTBBcGlwZWxpbmUudW5ldCUyMCUzRCUyMGNvbXBpbGVfcmVnaW9ucyhwaXBlbGluZS51bmV0JTJDJTIwbW9kZSUzRCUyMnJlZHVjZS1vdmVyaGVhZCUyMiUyQyUyMGZ1bGxncmFwaCUzRFRydWUp",highlighted:`<span class="hljs-comment"># pip install -U accelerate</span>
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
<span class="hljs-keyword">from</span> accelerate.utils <span class="hljs-keyword">import</span> <span class="hljs-built_in">compile</span> regions
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.float16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
pipeline.unet = compile_regions(pipeline.unet, mode=<span class="hljs-string">&quot;reduce-overhead&quot;</span>, fullgraph=<span class="hljs-literal">True</span>)`,wrap:!1}}),de=new W({props:{title:"图中断",local:"图中断",headingTag:"h3"}}),be=new G({props:{code:"LSUyMGxhdGVudHMlMjAlM0QlMjB1bmV0KCUwQS0lMjAlMjAlMjBsYXRlbnRzJTJDJTIwdGltZXN0ZXAlM0R0aW1lc3RlcCUyQyUyMGVuY29kZXJfaGlkZGVuX3N0YXRlcyUzRHByb21wdF9lbWJlZHMlMEEtKS5zYW1wbGUlMEElMEElMkIlMjBsYXRlbnRzJTIwJTNEJTIwdW5ldCglMEElMkIlMjAlMjAlMjBsYXRlbnRzJTJDJTIwdGltZXN0ZXAlM0R0aW1lc3RlcCUyQyUyMGVuY29kZXJfaGlkZGVuX3N0YXRlcyUzRHByb21wdF9lbWJlZHMlMkMlMjByZXR1cm5fZGljdCUzREZhbHNlJTBBJTJCKSU1QjAlNUQ=",highlighted:`<span class="hljs-deletion">- latents = unet(</span>
<span class="hljs-deletion">- latents, timestep=timestep, encoder_hidden_states=prompt_embeds</span>
<span class="hljs-deletion">-).sample</span>
<span class="hljs-addition">+ latents = unet(</span>
<span class="hljs-addition">+ latents, timestep=timestep, encoder_hidden_states=prompt_embeds, return_dict=False</span>
<span class="hljs-addition">+)[0]</span>`,wrap:!1}}),$e=new W({props:{title:"GPU同步",local:"gpu同步",headingTag:"h3"}}),I=new Ve({props:{$$slots:{default:[Dl]},$$scope:{ctx:j}}}),ye=new W({props:{title:"基准测试",local:"基准测试",headingTag:"h3"}}),Ue=new W({props:{title:"动态量化",local:"动态量化",headingTag:"h2"}}),V=new Ve({props:{warning:!1,$$slots:{default:[Kl]},$$scope:{ctx:j}}}),_e=new G({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwdG9yY2hhbyUyMGltcG9ydCUyMGFwcGx5X2R5bmFtaWNfcXVhbnQlMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcuY29udl8xeDFfYXNfbW0lMjAlM0QlMjBUcnVlJTBBdG9yY2guX2luZHVjdG9yLmNvbmZpZy5jb29yZGluYXRlX2Rlc2NlbnRfdHVuaW5nJTIwJTNEJTIwVHJ1ZSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcuZXBpbG9ndWVfZnVzaW9uJTIwJTNEJTIwRmFsc2UlMEF0b3JjaC5faW5kdWN0b3IuY29uZmlnLmNvb3JkaW5hdGVfZGVzY2VudF9jaGVja19hbGxfZGlyZWN0aW9ucyUyMCUzRCUyMFRydWUlMEF0b3JjaC5faW5kdWN0b3IuY29uZmlnLmZvcmNlX2Z1c2VfaW50X21tX3dpdGhfbXVsJTIwJTNEJTIwVHJ1ZSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcudXNlX21peGVkX21tJTIwJTNEJTIwVHJ1ZQ==",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> torchao <span class="hljs-keyword">import</span> apply_dynamic_quant
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
torch._inductor.config.conv_1x1_as_mm = <span class="hljs-literal">True</span>
torch._inductor.config.coordinate_descent_tuning = <span class="hljs-literal">True</span>
torch._inductor.config.epilogue_fusion = <span class="hljs-literal">False</span>
torch._inductor.config.coordinate_descent_check_all_directions = <span class="hljs-literal">True</span>
torch._inductor.config.force_fuse_int_mm_with_mul = <span class="hljs-literal">True</span>
torch._inductor.config.use_mixed_mm = <span class="hljs-literal">True</span>`,wrap:!1}}),Be=new G({props:{code:"cGlwZWxpbmUlMjAlM0QlMjBTdGFibGVEaWZmdXNpb25YTFBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJzdGFiaWxpdHlhaSUyRnN0YWJsZS1kaWZmdXNpb24teGwtYmFzZS0xLjAlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmJmbG9hdDE2JTBBKS50byglMjJjdWRhJTIyKSUwQSUwQWFwcGx5X2R5bmFtaWNfcXVhbnQocGlwZWxpbmUudW5ldCUyQyUyMGR5bmFtaWNfcXVhbnRfZmlsdGVyX2ZuKSUwQWFwcGx5X2R5bmFtaWNfcXVhbnQocGlwZWxpbmUudmFlJTJDJTIwZHluYW1pY19xdWFudF9maWx0ZXJfZm4pJTBBJTBBcHJvbXB0JTIwJTNEJTIwJTIyQXN0cm9uYXV0JTIwaW4lMjBhJTIwanVuZ2xlJTJDJTIwY29sZCUyMGNvbG9yJTIwcGFsZXR0ZSUyQyUyMG11dGVkJTIwY29sb3JzJTJDJTIwZGV0YWlsZWQlMkMlMjA4ayUyMiUwQXBpcGVsaW5lKHByb21wdCUyQyUyMG51bV9pbmZlcmVuY2Vfc3RlcHMlM0QzMCkuaW1hZ2VzJTVCMCU1RA==",highlighted:`pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
apply_dynamic_quant(pipeline.unet, dynamic_quant_filter_fn)
apply_dynamic_quant(pipeline.vae, dynamic_quant_filter_fn)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),xe=new W({props:{title:"融合投影矩阵",local:"融合投影矩阵",headingTag:"h2"}}),Xe=new G({props:{code:"cGlwZWxpbmUuZnVzZV9xa3ZfcHJvamVjdGlvbnMoKQ==",highlighted:"pipeline.fuse_qkv_projections()",wrap:!1}}),Re=new W({props:{title:"资源",local:"资源",headingTag:"h2"}}),Ie=new El({props:{source:"https://github.com/huggingface/diffusers/blob/main/docs/source/zh/optimization/fp16.md"}}),{c(){n=f("meta"),T=a(),i=f("p"),m=a(),h(y.$$.fragment),c=a(),o=f("p"),o.innerHTML=U,Z=a(),_=f("p"),_.textContent=B,J=a(),g=f("p"),g.textContent=Pt,He=a(),h(L.$$.fragment),Ee=a(),H=f("p"),H.textContent=qt,Ne=a(),h(v.$$.fragment),Qe=a(),h(E.$$.fragment),Fe=a(),h(C.$$.fragment),Ye=a(),N=f("p"),N.innerHTML=Dt,Se=a(),Q=f("p"),Q.innerHTML=Kt,ze=a(),h(F.$$.fragment),Ae=a(),h(Y.$$.fragment),Pe=a(),S=f("p"),S.innerHTML=Ot,qe=a(),z=f("p"),z.innerHTML=el,De=a(),h(A.$$.fragment),Ke=a(),P=f("p"),P.innerHTML=tl,Oe=a(),h(X.$$.fragment),et=a(),q=f("p"),q.innerHTML=ll,tt=a(),h(D.$$.fragment),lt=a(),K=f("p"),K.textContent=sl,st=a(),h(O.$$.fragment),nt=a(),h(R.$$.fragment),at=a(),ee=f("p"),ee.innerHTML=nl,pt=a(),te=f("p"),te.innerHTML=al,it=a(),h(le.$$.fragment),ot=a(),se=f("p"),se.innerHTML=pl,ft=a(),ne=f("p"),ne.innerHTML=il,rt=a(),ae=f("p"),ae.textContent=ol,mt=a(),h(pe.$$.fragment),ct=a(),ie=f("p"),ie.innerHTML=fl,ut=a(),oe=f("p"),oe.innerHTML=rl,ht=a(),h(fe.$$.fragment),dt=a(),re=f("p"),re.innerHTML=ml,Mt=a(),h(me.$$.fragment),bt=a(),h(k.$$.fragment),$t=a(),ce=f("p"),ce.innerHTML=cl,wt=a(),h(ue.$$.fragment),Tt=a(),he=f("p"),he.innerHTML=ul,yt=a(),h(de.$$.fragment),Jt=a(),Me=f("p"),Me.innerHTML=hl,jt=a(),h(be.$$.fragment),Ut=a(),h($e.$$.fragment),Zt=a(),we=f("p"),we.innerHTML=dl,gt=a(),Te=f("p"),Te.innerHTML=Ml,Gt=a(),h(I.$$.fragment),_t=a(),h(ye.$$.fragment),Wt=a(),Je=f("p"),Je.innerHTML=bl,Bt=a(),je=f("p"),je.innerHTML=$l,xt=a(),h(Ue.$$.fragment),vt=a(),Ze=f("p"),Ze.innerHTML=wl,Ct=a(),ge=f("p"),ge.innerHTML=Tl,Xt=a(),h(V.$$.fragment),Rt=a(),Ge=f("p"),Ge.textContent=yl,kt=a(),h(_e.$$.fragment),It=a(),We=f("p"),We.innerHTML=Jl,Vt=a(),h(Be.$$.fragment),Lt=a(),h(xe.$$.fragment),Ht=a(),ve=f("blockquote"),ve.innerHTML=jl,Et=a(),Ce=f("p"),Ce.textContent=Ul,Nt=a(),h(Xe.$$.fragment),Qt=a(),h(Re.$$.fragment),Ft=a(),ke=f("ul"),ke.innerHTML=Zl,Yt=a(),h(Ie.$$.fragment),St=a(),Le=f("p"),this.h()},l(e){const t=Ll("svelte-u9bgzb",document.head);n=r(t,"META",{name:!0,content:!0}),t.forEach(l),T=p(e),i=r(e,"P",{}),Cl(i).forEach(l),m=p(e),d(y.$$.fragment,e),c=p(e),o=r(e,"P",{"data-svelte-h":!0}),u(o)!=="svelte-194j5uc"&&(o.innerHTML=U),Z=p(e),_=r(e,"P",{"data-svelte-h":!0}),u(_)!=="svelte-k2hqss"&&(_.textContent=B),J=p(e),g=r(e,"P",{"data-svelte-h":!0}),u(g)!=="svelte-x0c89o"&&(g.textContent=Pt),He=p(e),d(L.$$.fragment,e),Ee=p(e),H=r(e,"P",{"data-svelte-h":!0}),u(H)!=="svelte-py0rfh"&&(H.textContent=qt),Ne=p(e),d(v.$$.fragment,e),Qe=p(e),d(E.$$.fragment,e),Fe=p(e),d(C.$$.fragment,e),Ye=p(e),N=r(e,"P",{"data-svelte-h":!0}),u(N)!=="svelte-3w1pt5"&&(N.innerHTML=Dt),Se=p(e),Q=r(e,"P",{"data-svelte-h":!0}),u(Q)!=="svelte-qbrif4"&&(Q.innerHTML=Kt),ze=p(e),d(F.$$.fragment,e),Ae=p(e),d(Y.$$.fragment,e),Pe=p(e),S=r(e,"P",{"data-svelte-h":!0}),u(S)!=="svelte-1pe18dl"&&(S.innerHTML=Ot),qe=p(e),z=r(e,"P",{"data-svelte-h":!0}),u(z)!=="svelte-g36iym"&&(z.innerHTML=el),De=p(e),d(A.$$.fragment,e),Ke=p(e),P=r(e,"P",{"data-svelte-h":!0}),u(P)!=="svelte-1wsgro0"&&(P.innerHTML=tl),Oe=p(e),d(X.$$.fragment,e),et=p(e),q=r(e,"P",{"data-svelte-h":!0}),u(q)!=="svelte-9ijocs"&&(q.innerHTML=ll),tt=p(e),d(D.$$.fragment,e),lt=p(e),K=r(e,"P",{"data-svelte-h":!0}),u(K)!=="svelte-15it70t"&&(K.textContent=sl),st=p(e),d(O.$$.fragment,e),nt=p(e),d(R.$$.fragment,e),at=p(e),ee=r(e,"P",{"data-svelte-h":!0}),u(ee)!=="svelte-1ahxi03"&&(ee.innerHTML=nl),pt=p(e),te=r(e,"P",{"data-svelte-h":!0}),u(te)!=="svelte-169dym5"&&(te.innerHTML=al),it=p(e),d(le.$$.fragment,e),ot=p(e),se=r(e,"P",{"data-svelte-h":!0}),u(se)!=="svelte-jmoihs"&&(se.innerHTML=pl),ft=p(e),ne=r(e,"P",{"data-svelte-h":!0}),u(ne)!=="svelte-w2qqjo"&&(ne.innerHTML=il),rt=p(e),ae=r(e,"P",{"data-svelte-h":!0}),u(ae)!=="svelte-dsv3x4"&&(ae.textContent=ol),mt=p(e),d(pe.$$.fragment,e),ct=p(e),ie=r(e,"P",{"data-svelte-h":!0}),u(ie)!=="svelte-1nyxskn"&&(ie.innerHTML=fl),ut=p(e),oe=r(e,"P",{"data-svelte-h":!0}),u(oe)!=="svelte-1qqd039"&&(oe.innerHTML=rl),ht=p(e),d(fe.$$.fragment,e),dt=p(e),re=r(e,"P",{"data-svelte-h":!0}),u(re)!=="svelte-wmi0r6"&&(re.innerHTML=ml),Mt=p(e),d(me.$$.fragment,e),bt=p(e),d(k.$$.fragment,e),$t=p(e),ce=r(e,"P",{"data-svelte-h":!0}),u(ce)!=="svelte-6v6dgf"&&(ce.innerHTML=cl),wt=p(e),d(ue.$$.fragment,e),Tt=p(e),he=r(e,"P",{"data-svelte-h":!0}),u(he)!=="svelte-zlqqst"&&(he.innerHTML=ul),yt=p(e),d(de.$$.fragment,e),Jt=p(e),Me=r(e,"P",{"data-svelte-h":!0}),u(Me)!=="svelte-qa92su"&&(Me.innerHTML=hl),jt=p(e),d(be.$$.fragment,e),Ut=p(e),d($e.$$.fragment,e),Zt=p(e),we=r(e,"P",{"data-svelte-h":!0}),u(we)!=="svelte-jzcid"&&(we.innerHTML=dl),gt=p(e),Te=r(e,"P",{"data-svelte-h":!0}),u(Te)!=="svelte-11fadgl"&&(Te.innerHTML=Ml),Gt=p(e),d(I.$$.fragment,e),_t=p(e),d(ye.$$.fragment,e),Wt=p(e),Je=r(e,"P",{"data-svelte-h":!0}),u(Je)!=="svelte-h31nps"&&(Je.innerHTML=bl),Bt=p(e),je=r(e,"P",{"data-svelte-h":!0}),u(je)!=="svelte-oh3qub"&&(je.innerHTML=$l),xt=p(e),d(Ue.$$.fragment,e),vt=p(e),Ze=r(e,"P",{"data-svelte-h":!0}),u(Ze)!=="svelte-1esyvy8"&&(Ze.innerHTML=wl),Ct=p(e),ge=r(e,"P",{"data-svelte-h":!0}),u(ge)!=="svelte-10dayln"&&(ge.innerHTML=Tl),Xt=p(e),d(V.$$.fragment,e),Rt=p(e),Ge=r(e,"P",{"data-svelte-h":!0}),u(Ge)!=="svelte-19bhy1d"&&(Ge.textContent=yl),kt=p(e),d(_e.$$.fragment,e),It=p(e),We=r(e,"P",{"data-svelte-h":!0}),u(We)!=="svelte-17j89ie"&&(We.innerHTML=Jl),Vt=p(e),d(Be.$$.fragment,e),Lt=p(e),d(xe.$$.fragment,e),Ht=p(e),ve=r(e,"BLOCKQUOTE",{"data-svelte-h":!0}),u(ve)!=="svelte-tty6r3"&&(ve.innerHTML=jl),Et=p(e),Ce=r(e,"P",{"data-svelte-h":!0}),u(Ce)!=="svelte-qv5ldk"&&(Ce.textContent=Ul),Nt=p(e),d(Xe.$$.fragment,e),Qt=p(e),d(Re.$$.fragment,e),Ft=p(e),ke=r(e,"UL",{"data-svelte-h":!0}),u(ke)!=="svelte-58n17n"&&(ke.innerHTML=Zl),Yt=p(e),d(Ie.$$.fragment,e),St=p(e),Le=r(e,"P",{}),Cl(Le).forEach(l),this.h()},h(){Xl(n,"name","hf:doc:metadata"),Xl(n,"content",es)},m(e,t){Hl(document.head,n),s(e,T,t),s(e,i,t),s(e,m,t),M(y,e,t),s(e,c,t),s(e,o,t),s(e,Z,t),s(e,_,t),s(e,J,t),s(e,g,t),s(e,He,t),M(L,e,t),s(e,Ee,t),s(e,H,t),s(e,Ne,t),M(v,e,t),s(e,Qe,t),M(E,e,t),s(e,Fe,t),M(C,e,t),s(e,Ye,t),s(e,N,t),s(e,Se,t),s(e,Q,t),s(e,ze,t),M(F,e,t),s(e,Ae,t),M(Y,e,t),s(e,Pe,t),s(e,S,t),s(e,qe,t),s(e,z,t),s(e,De,t),M(A,e,t),s(e,Ke,t),s(e,P,t),s(e,Oe,t),M(X,e,t),s(e,et,t),s(e,q,t),s(e,tt,t),M(D,e,t),s(e,lt,t),s(e,K,t),s(e,st,t),M(O,e,t),s(e,nt,t),M(R,e,t),s(e,at,t),s(e,ee,t),s(e,pt,t),s(e,te,t),s(e,it,t),M(le,e,t),s(e,ot,t),s(e,se,t),s(e,ft,t),s(e,ne,t),s(e,rt,t),s(e,ae,t),s(e,mt,t),M(pe,e,t),s(e,ct,t),s(e,ie,t),s(e,ut,t),s(e,oe,t),s(e,ht,t),M(fe,e,t),s(e,dt,t),s(e,re,t),s(e,Mt,t),M(me,e,t),s(e,bt,t),M(k,e,t),s(e,$t,t),s(e,ce,t),s(e,wt,t),M(ue,e,t),s(e,Tt,t),s(e,he,t),s(e,yt,t),M(de,e,t),s(e,Jt,t),s(e,Me,t),s(e,jt,t),M(be,e,t),s(e,Ut,t),M($e,e,t),s(e,Zt,t),s(e,we,t),s(e,gt,t),s(e,Te,t),s(e,Gt,t),M(I,e,t),s(e,_t,t),M(ye,e,t),s(e,Wt,t),s(e,Je,t),s(e,Bt,t),s(e,je,t),s(e,xt,t),M(Ue,e,t),s(e,vt,t),s(e,Ze,t),s(e,Ct,t),s(e,ge,t),s(e,Xt,t),M(V,e,t),s(e,Rt,t),s(e,Ge,t),s(e,kt,t),M(_e,e,t),s(e,It,t),s(e,We,t),s(e,Vt,t),M(Be,e,t),s(e,Lt,t),M(xe,e,t),s(e,Ht,t),s(e,ve,t),s(e,Et,t),s(e,Ce,t),s(e,Nt,t),M(Xe,e,t),s(e,Qt,t),M(Re,e,t),s(e,Ft,t),s(e,ke,t),s(e,Yt,t),M(Ie,e,t),s(e,St,t),s(e,Le,t),zt=!0},p(e,[t]){const gl={};t&2&&(gl.$$scope={dirty:t,ctx:e}),v.$set(gl);const Gl={};t&2&&(Gl.$$scope={dirty:t,ctx:e}),C.$set(Gl);const _l={};t&2&&(_l.$$scope={dirty:t,ctx:e}),X.$set(_l);const Wl={};t&2&&(Wl.$$scope={dirty:t,ctx:e}),R.$set(Wl);const Bl={};t&2&&(Bl.$$scope={dirty:t,ctx:e}),k.$set(Bl);const xl={};t&2&&(xl.$$scope={dirty:t,ctx:e}),I.$set(xl);const vl={};t&2&&(vl.$$scope={dirty:t,ctx:e}),V.$set(vl)},i(e){zt||(b(y.$$.fragment,e),b(L.$$.fragment,e),b(v.$$.fragment,e),b(E.$$.fragment,e),b(C.$$.fragment,e),b(F.$$.fragment,e),b(Y.$$.fragment,e),b(A.$$.fragment,e),b(X.$$.fragment,e),b(D.$$.fragment,e),b(O.$$.fragment,e),b(R.$$.fragment,e),b(le.$$.fragment,e),b(pe.$$.fragment,e),b(fe.$$.fragment,e),b(me.$$.fragment,e),b(k.$$.fragment,e),b(ue.$$.fragment,e),b(de.$$.fragment,e),b(be.$$.fragment,e),b($e.$$.fragment,e),b(I.$$.fragment,e),b(ye.$$.fragment,e),b(Ue.$$.fragment,e),b(V.$$.fragment,e),b(_e.$$.fragment,e),b(Be.$$.fragment,e),b(xe.$$.fragment,e),b(Xe.$$.fragment,e),b(Re.$$.fragment,e),b(Ie.$$.fragment,e),zt=!0)},o(e){$(y.$$.fragment,e),$(L.$$.fragment,e),$(v.$$.fragment,e),$(E.$$.fragment,e),$(C.$$.fragment,e),$(F.$$.fragment,e),$(Y.$$.fragment,e),$(A.$$.fragment,e),$(X.$$.fragment,e),$(D.$$.fragment,e),$(O.$$.fragment,e),$(R.$$.fragment,e),$(le.$$.fragment,e),$(pe.$$.fragment,e),$(fe.$$.fragment,e),$(me.$$.fragment,e),$(k.$$.fragment,e),$(ue.$$.fragment,e),$(de.$$.fragment,e),$(be.$$.fragment,e),$($e.$$.fragment,e),$(I.$$.fragment,e),$(ye.$$.fragment,e),$(Ue.$$.fragment,e),$(V.$$.fragment,e),$(_e.$$.fragment,e),$(Be.$$.fragment,e),$(xe.$$.fragment,e),$(Xe.$$.fragment,e),$(Re.$$.fragment,e),$(Ie.$$.fragment,e),zt=!1},d(e){e&&(l(T),l(i),l(m),l(c),l(o),l(Z),l(_),l(J),l(g),l(He),l(Ee),l(H),l(Ne),l(Qe),l(Fe),l(Ye),l(N),l(Se),l(Q),l(ze),l(Ae),l(Pe),l(S),l(qe),l(z),l(De),l(Ke),l(P),l(Oe),l(et),l(q),l(tt),l(lt),l(K),l(st),l(nt),l(at),l(ee),l(pt),l(te),l(it),l(ot),l(se),l(ft),l(ne),l(rt),l(ae),l(mt),l(ct),l(ie),l(ut),l(oe),l(ht),l(dt),l(re),l(Mt),l(bt),l($t),l(ce),l(wt),l(Tt),l(he),l(yt),l(Jt),l(Me),l(jt),l(Ut),l(Zt),l(we),l(gt),l(Te),l(Gt),l(_t),l(Wt),l(Je),l(Bt),l(je),l(xt),l(vt),l(Ze),l(Ct),l(ge),l(Xt),l(Rt),l(Ge),l(kt),l(It),l(We),l(Vt),l(Lt),l(Ht),l(ve),l(Et),l(Ce),l(Nt),l(Qt),l(Ft),l(ke),l(Yt),l(St),l(Le)),l(n),w(y,e),w(L,e),w(v,e),w(E,e),w(C,e),w(F,e),w(Y,e),w(A,e),w(X,e),w(D,e),w(O,e),w(R,e),w(le,e),w(pe,e),w(fe,e),w(me,e),w(k,e),w(ue,e),w(de,e),w(be,e),w($e,e),w(I,e),w(ye,e),w(Ue,e),w(V,e),w(_e,e),w(Be,e),w(xe,e),w(Xe,e),w(Re,e),w(Ie,e)}}}const es='{"title":"加速推理","local":"加速推理","sections":[{"title":"模型数据类型","local":"模型数据类型","sections":[],"depth":2},{"title":"缩放点积注意力","local":"缩放点积注意力","sections":[],"depth":2},{"title":"torch.compile","local":"torchcompile","sections":[{"title":"动态形状编译","local":"动态形状编译","sections":[],"depth":3},{"title":"区域编译","local":"区域编译","sections":[],"depth":3},{"title":"图中断","local":"图中断","sections":[],"depth":3},{"title":"GPU同步","local":"gpu同步","sections":[],"depth":3},{"title":"基准测试","local":"基准测试","sections":[],"depth":3}],"depth":2},{"title":"动态量化","local":"动态量化","sections":[],"depth":2},{"title":"融合投影矩阵","local":"融合投影矩阵","sections":[],"depth":2},{"title":"资源","local":"资源","sections":[],"depth":2}],"depth":1}';function ts(j){return kl(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class os extends Il{constructor(n){super(),Vl(this,n,ts,Ol,Rl,{})}}export{os as component};

Xet Storage Details

Size:
44 kB
·
Xet hash:
d8ca336b3f125ef38acfcc610302d807ecc6d9d98883342d16e857fb2fbb48b2

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.