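/*
 * File-viewer page chrome captured together with this file (not part of the module):
 *   Buckets:
 *   rtrm's picture
 *   download
 *   raw
 *   42.9 kB
 */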
import{s as kt,o as Xt,n as zl}from"../chunks/scheduler.5c93273d.js";import{S as Rt,i as It,g as o,s as n,r as h,A as Vt,h as r,f as t,c as a,j as vt,u as d,x as c,k as Ie,y as Lt,a as s,v as M,d as b,t as T,w}from"../chunks/index.e43dd92b.js";import{T as Ht}from"../chunks/Tip.3538f9e3.js";import{C as G}from"../chunks/CodeBlock.6896320e.js";import{H as W,E as Et}from"../chunks/getInferenceSnippets.161194d2.js";import{H as Qt,a as Al}from"../chunks/HfOption.d50154c3.js";function Nt(_){let i,j="bfloat16与float16类似,但对数值误差更稳健。硬件对bfloat16的支持各不相同,但大多数现代GPU都能支持bfloat16。",m,u,y;return u=new G({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXBpcGVsaW5lJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyc3RhYmlsaXR5YWklMkZzdGFibGUtZGlmZnVzaW9uLXhsLWJhc2UtMS4wJTIyJTJDJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5iZmxvYXQxNiUwQSkudG8oJTIyY3VkYSUyMiklMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJBc3Ryb25hdXQlMjBpbiUyMGElMjBqdW5nbGUlMkMlMjBjb2xkJTIwY29sb3IlMjBwYWxldHRlJTJDJTIwbXV0ZWQlMjBjb2xvcnMlMkMlMjBkZXRhaWxlZCUyQyUyMDhrJTIyJTBBcGlwZWxpbmUocHJvbXB0JTJDJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDMwKS5pbWFnZXMlNUIwJTVE",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),{c(){i=o("p"),i.textContent=j,m=n(),h(u.$$.fragment)},l(f){i=r(f,"P",{"data-svelte-h":!0}),c(i)!=="svelte-1u2chbj"&&(i.textContent=j),m=a(f),d(u.$$.fragment,f)},m(f,p){s(f,i,p),s(f,m,p),M(u,f,p),y=!0},p:zl,i(f){y||(b(u.$$.fragment,f),y=!0)},o(f){T(u.$$.fragment,f),y=!1},d(f){f&&(t(i),t(m)),w(u,f)}}}function Ft(_){let i,j="float16与bfloat16类似,但可能更容易出现数值误差。",m,u,y;return u=new G({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXBpcGVsaW5lJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyc3RhYmlsaXR5YWklMkZzdGFibGUtZGlmZnVzaW9uLXhsLWJhc2UtMS4wJTIyJTJDJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5mbG9hdDE2JTBBKS50byglMjJjdWRhJTIyKSUwQSUwQXByb21wdCUyMCUzRCUyMCUyMkFzdHJvbmF1dCUyMGluJTIwYSUyMGp1bmdsZSUyQyUyMGNvbGQlMjBjb2xvciUyMHBhbGV0dGUlMkMlMjBtdXRlZCUyMGNvbG9ycyUyQyUyMGRldGFpbGVkJTJDJTIwOGslMjIlMEFwaXBlbGluZShwcm9tcHQlMkMlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNEMzApLmltYWdlcyU1QjAlNUQ=",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.float16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),{c(){i=o("p"),i.textContent=j,m=n(),h(u.$$.fragment)},l(f){i=r(f,"P",{"data-svelte-h":!0}),c(i)!=="svelte-1nw1ct0"&&(i.textContent=j),m=a(f),d(u.$$.fragment,f)},m(f,p){s(f,i,p),s(f,m,p),M(u,f,p),y=!0},p:zl,i(f){y||(b(u.$$.fragment,f),y=!0)},o(f){T(u.$$.fragment,f),y=!1},d(f){f&&(t(i),t(m)),w(u,f)}}}function Yt(_){let i,j='<a href="https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/" rel="nofollow">TensorFloat-32 (tf32)</a>模式在NVIDIA Ampere GPU上受支持,它以tf32计算卷积和矩阵乘法运算。存储和其他操作保持在float32。与bfloat16或float16结合使用时,可以显著加快计算速度。',m,u,y="PyTorch默认仅对卷积启用tf32模式,您需要显式启用矩阵乘法的tf32模式。",f,p,$,U,g='更多详情请参阅<a href="https://huggingface.co/docs/transformers/en/perf_train_gpu_one#mixed-precision" rel="nofollow">混合精度训练</a>文档。',B;return p=new G({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXRvcmNoLmJhY2tlbmRzLmN1ZGEubWF0bXVsLmFsbG93X3RmMzIlMjAlM0QlMjBUcnVlJTBBJTBBcGlwZWxpbmUlMjAlM0QlMjBTdGFibGVEaWZmdXNpb25YTFBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJzdGFiaWxpdHlhaSUyRnN0YWJsZS1kaWZmdXNpb24teGwtYmFzZS0xLjAlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmJmbG9hdDE2JTBBKS50byglMjJjdWRhJTIyKSUwQSUwQXByb21wdCUyMCUzRCUyMCUyMkFzdHJvbmF1dCUyMGluJTIwYSUyMGp1bmdsZSUyQyUyMGNvbGQlMjBjb2xvciUyMHBhbGV0dGUlMkMlMjBtdXRlZCUyMGNvbG9ycyUyQyUyMGRldGFpbGVkJTJDJTIwOGslMjIlMEFwaXBlbGluZShwcm9tcHQlMkMlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNEMzApLmltYWdlcyU1QjAlNUQ=",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
torch.backends.cuda.matmul.allow_tf32 = <span class="hljs-literal">True</span>
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),{c(){i=o("p"),i.innerHTML=j,m=n(),u=o("p"),u.textContent=y,f=n(),h(p.$$.fragment),$=n(),U=o("p"),U.innerHTML=g},l(J){i=r(J,"P",{"data-svelte-h":!0}),c(i)!=="svelte-o6hhje"&&(i.innerHTML=j),m=a(J),u=r(J,"P",{"data-svelte-h":!0}),c(u)!=="svelte-1p6w426"&&(u.textContent=y),f=a(J),d(p.$$.fragment,J),$=a(J),U=r(J,"P",{"data-svelte-h":!0}),c(U)!=="svelte-1v2g6n5"&&(U.innerHTML=g)},m(J,Z){s(J,i,Z),s(J,m,Z),s(J,u,Z),s(J,f,Z),M(p,J,Z),s(J,$,Z),s(J,U,Z),B=!0},p:zl,i(J){B||(b(p.$$.fragment,J),B=!0)},o(J){T(p.$$.fragment,J),B=!1},d(J){J&&(t(i),t(m),t(u),t(f),t($),t(U)),w(p,J)}}}function St(_){let i,j,m,u,y,f;return i=new Al({props:{id:"dtypes",option:"bfloat16",$$slots:{default:[Nt]},$$scope:{ctx:_}}}),m=new Al({props:{id:"dtypes",option:"float16",$$slots:{default:[Ft]},$$scope:{ctx:_}}}),y=new Al({props:{id:"dtypes",option:"TensorFloat-32",$$slots:{default:[Yt]},$$scope:{ctx:_}}}),{c(){h(i.$$.fragment),j=n(),h(m.$$.fragment),u=n(),h(y.$$.fragment)},l(p){d(i.$$.fragment,p),j=a(p),d(m.$$.fragment,p),u=a(p),d(y.$$.fragment,p)},m(p,$){M(i,p,$),s(p,j,$),M(m,p,$),s(p,u,$),M(y,p,$),f=!0},p(p,$){const U={};$&2&&(U.$$scope={dirty:$,ctx:p}),i.$set(U);const g={};$&2&&(g.$$scope={dirty:$,ctx:p}),m.$set(g);const B={};$&2&&(B.$$scope={dirty:$,ctx:p}),y.$set(B)},i(p){f||(b(i.$$.fragment,p),b(m.$$.fragment,p),b(y.$$.fragment,p),f=!0)},o(p){T(i.$$.fragment,p),T(m.$$.fragment,p),T(y.$$.fragment,p),f=!1},d(p){p&&(t(j),t(u)),w(i,p),w(m,p),w(y,p)}}}function zt(_){let i,j='参阅<a href="https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/" rel="nofollow">torch.compile和Diffusers:峰值性能实践指南</a>博客文章,了解如何为扩散模型最大化<code>torch.compile</code>的性能。';return{c(){i=o("p"),i.innerHTML=j},l(m){i=r(m,"P",{"data-svelte-h":!0}),c(i)!=="svelte-fxt4zr"&&(i.innerHTML=j)},m(m,u){s(m,i,u)},p:zl,d(m){m&&t(i)}}}function At(_){let 
i,j,m,u,y,f,p,$='Diffusion模型在推理时速度较慢,因为生成是一个迭代过程,需要经过一定数量的”步数”逐步将噪声细化为图像或视频。要加速这一过程,您可以尝试使用不同的<a href="../api/schedulers/overview">调度器</a>、降低模型权重的精度以加快计算、使用更高效的内存注意力机制等方法。',U,g,B="将这些技术组合使用,可以比单独使用任何一种技术获得更快的推理速度。",J,Z,ql="本指南将介绍如何加速推理。",Le,V,He,L,Pl="模型权重的精度和数据类型会影响推理速度,因为更高的精度需要更多内存来加载,也需要更多时间进行计算。PyTorch默认以float32或全精度加载模型权重,因此更改数据类型是快速获得更快推理速度的简单方法。",Ee,x,Qe,H,Ne,C,Dl='<p>内存高效注意力优化了推理速度<em>和</em><a href="./memory#memory-efficient-attention">内存使用</a>!</p>',Fe,E,Kl='<a href="https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html" rel="nofollow">缩放点积注意力(SDPA)</a>实现了多种注意力后端,包括<a href="https://github.com/Dao-AILab/flash-attention" rel="nofollow">FlashAttention</a>、<a href="https://github.com/facebookresearch/xformers" rel="nofollow">xFormers</a>和原生C++实现。它会根据您的硬件自动选择最优的后端。',Ye,Q,Ol='如果您使用的是PyTorch &gt;= 2.0,SDPA默认启用,无需对代码进行任何额外更改。不过,您也可以尝试使用其他注意力后端来自行选择。下面的示例使用<a href="https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html" rel="nofollow">torch.nn.attention.sdpa_kernel</a>上下文管理器来启用高效注意力。',Se,N,ze,F,Ae,Y,et='<a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html" rel="nofollow">torch.compile</a>通过将PyTorch代码和操作编译为优化的内核来加速推理。Diffusers通常会编译计算密集型的模型,如UNet、transformer或VAE。',qe,S,lt='启用以下编译器设置以获得最大速度(更多选项请参阅<a href="https://github.com/pytorch/pytorch/blob/main/torch/_inductor/config.py" rel="nofollow">完整列表</a>)。',Pe,z,De,A,tt="加载并编译UNet和VAE。有几种不同的模式可供选择,但<code>&quot;max-autotune&quot;</code>通过编译为CUDA图来优化速度。CUDA图通过单个CPU操作启动多个GPU操作,有效减少了开销。",Ke,v,st='<p>在PyTorch 2.3.1中,您可以控制torch.compile的缓存行为。这对于像<code>&quot;max-autotune&quot;</code>这样的编译模式特别有用,它会通过网格搜索多个编译标志来找到最优配置。更多详情请参阅<a href="https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html" rel="nofollow">torch.compile中的编译时间缓存</a>教程。</p>',Oe,q,nt='将内存布局更改为<a 
href="./memory#torchchannels_last">channels_last</a>也可以优化内存和推理速度。',el,P,ll,D,at="第一次编译时速度较慢,但一旦编译完成,速度会显著提升。尽量只在相同类型的推理操作上使用编译后的管道。在不同尺寸的图像上调用编译后的管道会重新触发编译,这会很慢且效率低下。",tl,K,sl,k,pt="<p>确保始终使用PyTorch的nightly版本以获得更好的支持。</p>",nl,O,it="<code>torch.compile</code>会跟踪输入形状和条件,如果这些不同,它会重新编译模型。例如,如果模型是在1024x1024分辨率的图像上编译的,而在不同分辨率的图像上使用,就会触发重新编译。",al,ee,ot="为避免重新编译,添加<code>dynamic=True</code>以尝试生成更动态的内核,避免条件变化时重新编译。",pl,le,il,te,rt='指定<code>use_duck_shape=False</code>会指示编译器是否应使用相同的符号变量来表示相同大小的输入。更多详情请参阅此<a href="https://github.com/huggingface/diffusers/pull/11327#discussion_r2047659790" rel="nofollow">评论</a>。',ol,se,ft='并非所有模型都能开箱即用地从动态编译中受益,可能需要更改。参考此<a href="https://github.com/huggingface/diffusers/pull/11297/" rel="nofollow">PR</a>,它改进了<code>AuraFlowPipeline</code>的实现以受益于动态编译。',rl,ne,ct="如果动态编译对Diffusers模型的效果不如预期,请随时提出问题。",fl,ae,cl,pe,mt='<a href="https://docs.pytorch.org/tutorials/recipes/regional_compilation.html" rel="nofollow">区域编译</a>通过仅编译模型中<em>小而频繁重复的块</em>(通常是transformer层)来减少冷启动延迟,并为每个后续出现的块重用编译后的工件。对于许多diffusion架构,这提供了与全图编译相同的运行时加速,并将编译时间减少了8-10倍。',ml,ie,ut="使用<code>compile_repeated_blocks()</code>方法(一个包装<code>torch.compile</code>的辅助函数)在任何组件(如transformer模型)上,如下所示。",ul,oe,hl,re,ht="要为新模型启用区域编译,请在模型类中添加一个<code>_repeated_blocks</code>属性,包含您想要编译的块的类名(作为字符串)。",dl,fe,Ml,X,dt='<p>更多区域编译示例,请参阅参考<a href="https://github.com/huggingface/diffusers/pull/11705" rel="nofollow">PR</a>。</p>',bl,ce,Mt='<a href="https://huggingface.co/docs/accelerate/index" rel="nofollow">Accelerate</a>中还有一个<a href="https://github.com/huggingface/accelerate/blob/273799c85d849a1954a4f2e65767216eb37fa089/src/accelerate/utils/other.py#L78" 
rel="nofollow">compile_regions</a>方法,可以自动选择模型中的候选块进行编译。其余图会单独编译。这对于快速实验很有用,因为您不需要设置哪些块要编译或调整编译标志。',Tl,me,wl,ue,bt="<code>compile_repeated_blocks()</code>是故意显式的。在<code>_repeated_blocks</code>中列出要重复的块,辅助函数仅编译这些块。它提供了可预测的行为,并且只需一行代码即可轻松推理缓存重用。",yl,he,Jl,de,Tt="在torch.compile中指定<code>fullgraph=True</code>非常重要,以确保底层模型中没有图中断。这使您可以充分利用torch.compile而不会降低性能。对于UNet和VAE,这会改变您访问返回变量的方式。",jl,Me,$l,be,Ul,Te,wt='每次去噪器做出预测后,调度器的<code>step()</code>函数会被<a href="https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L1228" rel="nofollow">调用</a>,并且<code>sigmas</code>变量会被<a href="https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/schedulers/scheduling_euler_discrete.py#L476" rel="nofollow">索引</a>。当放在GPU上时,这会引入延迟,因为CPU和GPU之间需要进行通信同步。当去噪器已经编译时,这一点会更加明显。',Zl,we,yt='一般来说,<code>sigmas</code>应该<a href="https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/schedulers/scheduling_euler_discrete.py#L240" rel="nofollow">保持在CPU上</a>,以避免通信同步和延迟。',Gl,R,gl,ye,_l,Je,Jt='参阅<a href="https://huggingface.co/datasets/diffusers/benchmarks" rel="nofollow">diffusers/benchmarks</a>数据集,查看编译管道的推理延迟和内存使用数据。',Wl,je,jt='<a href="https://github.com/sayakpaul/diffusers-torchao#benchmarking-results" rel="nofollow">diffusers-torchao</a>仓库还包含Flux和CogVideoX编译版本的基准测试结果。',Bl,$e,xl,Ue,$t='<a href="https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html" rel="nofollow">动态量化</a>通过降低精度以加快数学运算来提高推理速度。这种特定类型的量化在运行时根据数据确定如何缩放激活,而不是使用固定的缩放因子。因此,缩放因子与数据更准确地匹配。',Cl,Ze,Ut='以下示例使用<a href="../quantization/torchao">torchao</a>库对UNet和VAE应用<a href="https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html" rel="nofollow">动态int8量化</a>。',vl,I,Zt='<p>参阅我们的<a href="../quantization/torchao">torchao</a>文档,了解更多关于如何使用Diffusers torchao集成的信息。</p>',kl,Ge,Gt="配置编译器标志以获得最大速度。",Xl,ge,Rl,_e,gt='使用<a 
href="https://github.com/huggingface/diffusion-fast/blob/0f169640b1db106fe6a479f78c1ed3bfaeba3386/utils/pipeline_utils.py#L16" rel="nofollow">dynamic_quant_filter_fn</a>过滤掉UNet和VAE中一些不会从动态量化中受益的线性层。',Il,We,Vl,Be,Ll,xe,_t='<p>[!WARNING][fuse_qkv_projections](<a href="https://github.com/huggingface/diffusers/blob/58431f102cf39c3c8a569f32d71b2ea8caa461e1/src/diffusers/pipelines/pipeline_utils.py#L2034)%E6%96%B9%E6%B3%95%E6%98%AF%E5%AE%9E%E9%AA%8C%E6%80%A7%E7%9A%84%EF%BC%8C%E7%9B%AE%E5%89%8D%E4%B8%BB%E8%A6%81%E6%94%AF%E6%8C%81Stable" rel="nofollow">https://github.com/huggingface/diffusers/blob/58431f102cf39c3c8a569f32d71b2ea8caa461e1/src/diffusers/pipelines/pipeline_utils.py#L2034)方法是实验性的,目前主要支持Stable</a> Diffusion管道。参阅此<a href="https://github.com/huggingface/diffusers/pull/6179" rel="nofollow">PR</a>了解如何为其他管道启用它。</p>',Hl,Ce,Wt="在注意力块中,输入被投影到三个子空间,分别由投影矩阵Q、K和V表示。这些投影通常单独计算,但您可以水平组合这些矩阵为一个矩阵,并在单步中执行投影。这会增加输入投影的矩阵乘法大小,并提高量化的效果。",El,ve,Ql,ke,Nl,Xe,Bt='<li><p>阅读<a href="https://pytorch.org/blog/presenting-flux-fast-making-flux-go-brrr-on-h100s/" rel="nofollow">Presenting Flux Fast: Making Flux go brrr on H100s</a>博客文章,了解如何结合所有这些优化与<a href="https://docs.pytorch.org/docs/stable/torch.compiler.html" rel="nofollow">TorchInductor</a>和<a href="https://docs.pytorch.org/docs/stable/torch.compiler_aot_inductor.html" rel="nofollow">AOTInductor</a>,使用<a href="https://github.com/huggingface/flux-fast" rel="nofollow">flux-fast</a>的配方获得约2.5倍的加速。</p> <p>这些配方支持AMD硬件和<a href="https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev" rel="nofollow">Flux.1 Kontext Dev</a>。</p></li> <li><p>阅读<a href="https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/" rel="nofollow">torch.compile和Diffusers:峰值性能实践指南</a>博客文章,了解如何在使用<code>torch.compile</code>时最大化性能。</p></li>',Fl,Re,Yl,Ve,Sl;return y=new W({props:{title:"加速推理",local:"加速推理",headingTag:"h1"}}),V=new W({props:{title:"模型数据类型",local:"模型数据类型",headingTag:"h2"}}),x=new 
Qt({props:{id:"dtypes",options:["bfloat16","float16","TensorFloat-32"],$$slots:{default:[St]},$$scope:{ctx:_}}}),H=new W({props:{title:"缩放点积注意力",local:"缩放点积注意力",headingTag:"h2"}}),N=new G({props:{code:"ZnJvbSUyMHRvcmNoLm5uLmF0dGVudGlvbiUyMGltcG9ydCUyMFNEUEJhY2tlbmQlMkMlMjBzZHBhX2tlcm5lbCUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMFN0YWJsZURpZmZ1c2lvblhMUGlwZWxpbmUlMEElMEFwaXBlbGluZSUyMCUzRCUyMFN0YWJsZURpZmZ1c2lvblhMUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMnN0YWJpbGl0eWFpJTJGc3RhYmxlLWRpZmZ1c2lvbi14bC1iYXNlLTEuMCUyMiUyQyUyMHRvcmNoX2R0eXBlJTNEdG9yY2guYmZsb2F0MTYlMEEpLnRvKCUyMmN1ZGElMjIpJTBBJTBBcHJvbXB0JTIwJTNEJTIwJTIyQXN0cm9uYXV0JTIwaW4lMjBhJTIwanVuZ2xlJTJDJTIwY29sZCUyMGNvbG9yJTIwcGFsZXR0ZSUyQyUyMG11dGVkJTIwY29sb3JzJTJDJTIwZGV0YWlsZWQlMkMlMjA4ayUyMiUwQSUwQXdpdGglMjBzZHBhX2tlcm5lbChTRFBCYWNrZW5kLkVGRklDSUVOVF9BVFRFTlRJT04pJTNBJTBBJTIwJTIwaW1hZ2UlMjAlM0QlMjBwaXBlbGluZShwcm9tcHQlMkMlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNEMzApLmltYWdlcyU1QjAlNUQ=",highlighted:`<span class="hljs-keyword">from</span> torch.nn.attention <span class="hljs-keyword">import</span> SDPBackend, sdpa_kernel
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
<span class="hljs-keyword">with</span> sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
image = pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),F=new W({props:{title:"torch.compile",local:"torchcompile",headingTag:"h2"}}),z=new G({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcuY29udl8xeDFfYXNfbW0lMjAlM0QlMjBUcnVlJTBBdG9yY2guX2luZHVjdG9yLmNvbmZpZy5jb29yZGluYXRlX2Rlc2NlbnRfdHVuaW5nJTIwJTNEJTIwVHJ1ZSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcuZXBpbG9ndWVfZnVzaW9uJTIwJTNEJTIwRmFsc2UlMEF0b3JjaC5faW5kdWN0b3IuY29uZmlnLmNvb3JkaW5hdGVfZGVzY2VudF9jaGVja19hbGxfZGlyZWN0aW9ucyUyMCUzRCUyMFRydWU=",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
torch._inductor.config.conv_1x1_as_mm = <span class="hljs-literal">True</span>
torch._inductor.config.coordinate_descent_tuning = <span class="hljs-literal">True</span>
torch._inductor.config.epilogue_fusion = <span class="hljs-literal">False</span>
torch._inductor.config.coordinate_descent_check_all_directions = <span class="hljs-literal">True</span>`,wrap:!1}}),P=new G({props:{code:"cGlwZWxpbmUlMjAlM0QlMjBTdGFibGVEaWZmdXNpb25YTFBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJzdGFiaWxpdHlhaSUyRnN0YWJsZS1kaWZmdXNpb24teGwtYmFzZS0xLjAlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmZsb2F0MTYlMEEpLnRvKCUyMmN1ZGElMjIpJTBBcGlwZWxpbmUudW5ldC50byhtZW1vcnlfZm9ybWF0JTNEdG9yY2guY2hhbm5lbHNfbGFzdCklMEFwaXBlbGluZS52YWUudG8obWVtb3J5X2Zvcm1hdCUzRHRvcmNoLmNoYW5uZWxzX2xhc3QpJTBBcGlwZWxpbmUudW5ldCUyMCUzRCUyMHRvcmNoLmNvbXBpbGUoJTBBJTIwJTIwJTIwJTIwcGlwZWxpbmUudW5ldCUyQyUyMG1vZGUlM0QlMjJtYXgtYXV0b3R1bmUlMjIlMkMlMjBmdWxsZ3JhcGglM0RUcnVlJTBBKSUwQXBpcGVsaW5lLnZhZS5kZWNvZGUlMjAlM0QlMjB0b3JjaC5jb21waWxlKCUwQSUyMCUyMCUyMCUyMHBpcGVsaW5lLnZhZS5kZWNvZGUlMkMlMEElMjAlMjAlMjAlMjBtb2RlJTNEJTIybWF4LWF1dG90dW5lJTIyJTJDJTBBJTIwJTIwJTIwJTIwZnVsbGdyYXBoJTNEVHJ1ZSUwQSklMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJBc3Ryb25hdXQlMjBpbiUyMGElMjBqdW5nbGUlMkMlMjBjb2xkJTIwY29sb3IlMjBwYWxldHRlJTJDJTIwbXV0ZWQlMjBjb2xvcnMlMkMlMjBkZXRhaWxlZCUyQyUyMDhrJTIyJTBBcGlwZWxpbmUocHJvbXB0JTJDJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDMwKS5pbWFnZXMlNUIwJTVE",highlighted:`pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.float16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
pipeline.unet.to(memory_format=torch.channels_last)
pipeline.vae.to(memory_format=torch.channels_last)
pipeline.unet = torch.<span class="hljs-built_in">compile</span>(
pipeline.unet, mode=<span class="hljs-string">&quot;max-autotune&quot;</span>, fullgraph=<span class="hljs-literal">True</span>
)
pipeline.vae.decode = torch.<span class="hljs-built_in">compile</span>(
pipeline.vae.decode,
mode=<span class="hljs-string">&quot;max-autotune&quot;</span>,
fullgraph=<span class="hljs-literal">True</span>
)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),K=new W({props:{title:"动态形状编译",local:"动态形状编译",headingTag:"h3"}}),le=new G({props:{code:"JTJCJTIwdG9yY2guZnguZXhwZXJpbWVudGFsLl9jb25maWcudXNlX2R1Y2tfc2hhcGUlMjAlM0QlMjBGYWxzZSUwQSUyQiUyMHBpcGVsaW5lLnVuZXQlMjAlM0QlMjB0b3JjaC5jb21waWxlKCUwQSUyMCUyMCUyMCUyMHBpcGVsaW5lLnVuZXQlMkMlMjBmdWxsZ3JhcGglM0RUcnVlJTJDJTIwZHluYW1pYyUzRFRydWUlMEEp",highlighted:`<span class="hljs-addition">+ torch.fx.experimental._config.use_duck_shape = False</span>
<span class="hljs-addition">+ pipeline.unet = torch.compile(</span>
pipeline.unet, fullgraph=True, dynamic=True
)`,wrap:!1}}),ae=new W({props:{title:"区域编译",local:"区域编译",headingTag:"h3"}}),oe=new G({props:{code:"JTIzJTIwcGlwJTIwaW5zdGFsbCUyMC1VJTIwZGlmZnVzZXJzJTBBaW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXBpcGVsaW5lJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyc3RhYmlsaXR5YWklMkZzdGFibGUtZGlmZnVzaW9uLXhsLWJhc2UtMS4wJTIyJTJDJTBBJTIwJTIwJTIwJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5mbG9hdDE2JTJDJTBBKS50byglMjJjdWRhJTIyKSUwQSUwQSUyMyUyMCVFNCVCQiU4NSVFNyVCQyU5NiVFOCVBRiU5MVVOZXQlRTQlQjglQUQlRTklODclOEQlRTUlQTQlOEQlRTclOUElODR0cmFuc2Zvcm1lciVFNSVCMSU4MiUwQXBpcGVsaW5lLnVuZXQuY29tcGlsZV9yZXBlYXRlZF9ibG9ja3MoZnVsbGdyYXBoJTNEVHJ1ZSk=",highlighted:`<span class="hljs-comment"># pip install -U diffusers</span>
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>,
torch_dtype=torch.float16,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-comment"># 仅编译UNet中重复的transformer层</span>
pipeline.unet.compile_repeated_blocks(fullgraph=<span class="hljs-literal">True</span>)`,wrap:!1}}),fe=new G({props:{code:"Y2xhc3MlMjBNeVVOZXQoTW9kZWxNaXhpbiklM0ElMEElMjAlMjAlMjAlMjBfcmVwZWF0ZWRfYmxvY2tzJTIwJTNEJTIwKCUyMlRyYW5zZm9ybWVyMkRNb2RlbCUyMiUyQyklMjAlMjAlMjMlMjAlRTIlODYlOTAlMjAlRTklQkIlOTglRTglQUUlQTQlRTclQkMlOTYlRTglQUYlOTE=",highlighted:`<span class="hljs-keyword">class</span> <span class="hljs-title class_">MyUNet</span>(<span class="hljs-title class_ inherited__">ModelMixin</span>):
_repeated_blocks = (<span class="hljs-string">&quot;Transformer2DModel&quot;</span>,) <span class="hljs-comment"># ← 默认编译</span>`,wrap:!1}}),me=new G({props:{code:"JTIzJTIwcGlwJTIwaW5zdGFsbCUyMC1VJTIwYWNjZWxlcmF0ZSUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMFN0YWJsZURpZmZ1c2lvblhMUGlwZWxpbmUlMEFmcm9tJTIwYWNjZWxlcmF0ZS51dGlscyUyMGltcG9ydCUyMGNvbXBpbGUlMjByZWdpb25zJTBBJTBBcGlwZWxpbmUlMjAlM0QlMjBTdGFibGVEaWZmdXNpb25YTFBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJzdGFiaWxpdHlhaSUyRnN0YWJsZS1kaWZmdXNpb24teGwtYmFzZS0xLjAlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmZsb2F0MTYlMEEpLnRvKCUyMmN1ZGElMjIpJTBBcGlwZWxpbmUudW5ldCUyMCUzRCUyMGNvbXBpbGVfcmVnaW9ucyhwaXBlbGluZS51bmV0JTJDJTIwbW9kZSUzRCUyMnJlZHVjZS1vdmVyaGVhZCUyMiUyQyUyMGZ1bGxncmFwaCUzRFRydWUp",highlighted:`<span class="hljs-comment"># pip install -U accelerate</span>
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
<span class="hljs-keyword">from</span> accelerate.utils <span class="hljs-keyword">import</span> <span class="hljs-built_in">compile</span> regions
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.float16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
pipeline.unet = compile_regions(pipeline.unet, mode=<span class="hljs-string">&quot;reduce-overhead&quot;</span>, fullgraph=<span class="hljs-literal">True</span>)`,wrap:!1}}),he=new W({props:{title:"图中断",local:"图中断",headingTag:"h3"}}),Me=new G({props:{code:"LSUyMGxhdGVudHMlMjAlM0QlMjB1bmV0KCUwQS0lMjAlMjAlMjBsYXRlbnRzJTJDJTIwdGltZXN0ZXAlM0R0aW1lc3RlcCUyQyUyMGVuY29kZXJfaGlkZGVuX3N0YXRlcyUzRHByb21wdF9lbWJlZHMlMEEtKS5zYW1wbGUlMEElMEElMkIlMjBsYXRlbnRzJTIwJTNEJTIwdW5ldCglMEElMkIlMjAlMjAlMjBsYXRlbnRzJTJDJTIwdGltZXN0ZXAlM0R0aW1lc3RlcCUyQyUyMGVuY29kZXJfaGlkZGVuX3N0YXRlcyUzRHByb21wdF9lbWJlZHMlMkMlMjByZXR1cm5fZGljdCUzREZhbHNlJTBBJTJCKSU1QjAlNUQ=",highlighted:`<span class="hljs-deletion">- latents = unet(</span>
<span class="hljs-deletion">- latents, timestep=timestep, encoder_hidden_states=prompt_embeds</span>
<span class="hljs-deletion">-).sample</span>
<span class="hljs-addition">+ latents = unet(</span>
<span class="hljs-addition">+ latents, timestep=timestep, encoder_hidden_states=prompt_embeds, return_dict=False</span>
<span class="hljs-addition">+)[0]</span>`,wrap:!1}}),be=new W({props:{title:"GPU同步",local:"gpu同步",headingTag:"h3"}}),R=new Ht({props:{$$slots:{default:[zt]},$$scope:{ctx:_}}}),ye=new W({props:{title:"基准测试",local:"基准测试",headingTag:"h3"}}),$e=new W({props:{title:"动态量化",local:"动态量化",headingTag:"h2"}}),ge=new G({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwdG9yY2hhbyUyMGltcG9ydCUyMGFwcGx5X2R5bmFtaWNfcXVhbnQlMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcuY29udl8xeDFfYXNfbW0lMjAlM0QlMjBUcnVlJTBBdG9yY2guX2luZHVjdG9yLmNvbmZpZy5jb29yZGluYXRlX2Rlc2NlbnRfdHVuaW5nJTIwJTNEJTIwVHJ1ZSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcuZXBpbG9ndWVfZnVzaW9uJTIwJTNEJTIwRmFsc2UlMEF0b3JjaC5faW5kdWN0b3IuY29uZmlnLmNvb3JkaW5hdGVfZGVzY2VudF9jaGVja19hbGxfZGlyZWN0aW9ucyUyMCUzRCUyMFRydWUlMEF0b3JjaC5faW5kdWN0b3IuY29uZmlnLmZvcmNlX2Z1c2VfaW50X21tX3dpdGhfbXVsJTIwJTNEJTIwVHJ1ZSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcudXNlX21peGVkX21tJTIwJTNEJTIwVHJ1ZQ==",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> torchao <span class="hljs-keyword">import</span> apply_dynamic_quant
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
torch._inductor.config.conv_1x1_as_mm = <span class="hljs-literal">True</span>
torch._inductor.config.coordinate_descent_tuning = <span class="hljs-literal">True</span>
torch._inductor.config.epilogue_fusion = <span class="hljs-literal">False</span>
torch._inductor.config.coordinate_descent_check_all_directions = <span class="hljs-literal">True</span>
torch._inductor.config.force_fuse_int_mm_with_mul = <span class="hljs-literal">True</span>
torch._inductor.config.use_mixed_mm = <span class="hljs-literal">True</span>`,wrap:!1}}),We=new G({props:{code:"cGlwZWxpbmUlMjAlM0QlMjBTdGFibGVEaWZmdXNpb25YTFBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJzdGFiaWxpdHlhaSUyRnN0YWJsZS1kaWZmdXNpb24teGwtYmFzZS0xLjAlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmJmbG9hdDE2JTBBKS50byglMjJjdWRhJTIyKSUwQSUwQWFwcGx5X2R5bmFtaWNfcXVhbnQocGlwZWxpbmUudW5ldCUyQyUyMGR5bmFtaWNfcXVhbnRfZmlsdGVyX2ZuKSUwQWFwcGx5X2R5bmFtaWNfcXVhbnQocGlwZWxpbmUudmFlJTJDJTIwZHluYW1pY19xdWFudF9maWx0ZXJfZm4pJTBBJTBBcHJvbXB0JTIwJTNEJTIwJTIyQXN0cm9uYXV0JTIwaW4lMjBhJTIwanVuZ2xlJTJDJTIwY29sZCUyMGNvbG9yJTIwcGFsZXR0ZSUyQyUyMG11dGVkJTIwY29sb3JzJTJDJTIwZGV0YWlsZWQlMkMlMjA4ayUyMiUwQXBpcGVsaW5lKHByb21wdCUyQyUyMG51bV9pbmZlcmVuY2Vfc3RlcHMlM0QzMCkuaW1hZ2VzJTVCMCU1RA==",highlighted:`pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
apply_dynamic_quant(pipeline.unet, dynamic_quant_filter_fn)
apply_dynamic_quant(pipeline.vae, dynamic_quant_filter_fn)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),Be=new W({props:{title:"融合投影矩阵",local:"融合投影矩阵",headingTag:"h2"}}),ve=new G({props:{code:"cGlwZWxpbmUuZnVzZV9xa3ZfcHJvamVjdGlvbnMoKQ==",highlighted:"pipeline.fuse_qkv_projections()",wrap:!1}}),ke=new W({props:{title:"资源",local:"资源",headingTag:"h2"}}),Re=new Et({props:{source:"https://github.com/huggingface/diffusers/blob/main/docs/source/zh/optimization/fp16.md"}}),{c(){i=o("meta"),j=n(),m=o("p"),u=n(),h(y.$$.fragment),f=n(),p=o("p"),p.innerHTML=$,U=n(),g=o("p"),g.textContent=B,J=n(),Z=o("p"),Z.textContent=ql,Le=n(),h(V.$$.fragment),He=n(),L=o("p"),L.textContent=Pl,Ee=n(),h(x.$$.fragment),Qe=n(),h(H.$$.fragment),Ne=n(),C=o("blockquote"),C.innerHTML=Dl,Fe=n(),E=o("p"),E.innerHTML=Kl,Ye=n(),Q=o("p"),Q.innerHTML=Ol,Se=n(),h(N.$$.fragment),ze=n(),h(F.$$.fragment),Ae=n(),Y=o("p"),Y.innerHTML=et,qe=n(),S=o("p"),S.innerHTML=lt,Pe=n(),h(z.$$.fragment),De=n(),A=o("p"),A.innerHTML=tt,Ke=n(),v=o("blockquote"),v.innerHTML=st,Oe=n(),q=o("p"),q.innerHTML=nt,el=n(),h(P.$$.fragment),ll=n(),D=o("p"),D.textContent=at,tl=n(),h(K.$$.fragment),sl=n(),k=o("blockquote"),k.innerHTML=pt,nl=n(),O=o("p"),O.innerHTML=it,al=n(),ee=o("p"),ee.innerHTML=ot,pl=n(),h(le.$$.fragment),il=n(),te=o("p"),te.innerHTML=rt,ol=n(),se=o("p"),se.innerHTML=ft,rl=n(),ne=o("p"),ne.textContent=ct,fl=n(),h(ae.$$.fragment),cl=n(),pe=o("p"),pe.innerHTML=mt,ml=n(),ie=o("p"),ie.innerHTML=ut,ul=n(),h(oe.$$.fragment),hl=n(),re=o("p"),re.innerHTML=ht,dl=n(),h(fe.$$.fragment),Ml=n(),X=o("blockquote"),X.innerHTML=dt,bl=n(),ce=o("p"),ce.innerHTML=Mt,Tl=n(),h(me.$$.fragment),wl=n(),ue=o("p"),ue.innerHTML=bt,yl=n(),h(he.$$.fragment),Jl=n(),de=o("p"),de.innerHTML=Tt,jl=n(),h(Me.$$.fragment),$l=n(),h(be.$$.fragment),Ul=n(),Te=o("p"),Te.innerHTML=wt,Zl=n(),we=o("p"),we.innerHTML=yt,Gl=n(),h(R.$$.fragment),gl=n(),h(ye.$$.fragment),_l=n(),Je=o("p"),Je.innerHTML=Jt,Wl=n(),je=o("p"),je.innerHTML=jt,Bl=n(),h(
$e.$$.fragment),xl=n(),Ue=o("p"),Ue.innerHTML=$t,Cl=n(),Ze=o("p"),Ze.innerHTML=Ut,vl=n(),I=o("blockquote"),I.innerHTML=Zt,kl=n(),Ge=o("p"),Ge.textContent=Gt,Xl=n(),h(ge.$$.fragment),Rl=n(),_e=o("p"),_e.innerHTML=gt,Il=n(),h(We.$$.fragment),Vl=n(),h(Be.$$.fragment),Ll=n(),xe=o("blockquote"),xe.innerHTML=_t,Hl=n(),Ce=o("p"),Ce.textContent=Wt,El=n(),h(ve.$$.fragment),Ql=n(),h(ke.$$.fragment),Nl=n(),Xe=o("ul"),Xe.innerHTML=Bt,Fl=n(),h(Re.$$.fragment),Yl=n(),Ve=o("p"),this.h()},l(e){const l=Vt("svelte-u9bgzb",document.head);i=r(l,"META",{name:!0,content:!0}),l.forEach(t),j=a(e),m=r(e,"P",{}),vt(m).forEach(t),u=a(e),d(y.$$.fragment,e),f=a(e),p=r(e,"P",{"data-svelte-h":!0}),c(p)!=="svelte-194j5uc"&&(p.innerHTML=$),U=a(e),g=r(e,"P",{"data-svelte-h":!0}),c(g)!=="svelte-k2hqss"&&(g.textContent=B),J=a(e),Z=r(e,"P",{"data-svelte-h":!0}),c(Z)!=="svelte-x0c89o"&&(Z.textContent=ql),Le=a(e),d(V.$$.fragment,e),He=a(e),L=r(e,"P",{"data-svelte-h":!0}),c(L)!=="svelte-py0rfh"&&(L.textContent=Pl),Ee=a(e),d(x.$$.fragment,e),Qe=a(e),d(H.$$.fragment,e),Ne=a(e),C=r(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),c(C)!=="svelte-8dllu1"&&(C.innerHTML=Dl),Fe=a(e),E=r(e,"P",{"data-svelte-h":!0}),c(E)!=="svelte-3w1pt5"&&(E.innerHTML=Kl),Ye=a(e),Q=r(e,"P",{"data-svelte-h":!0}),c(Q)!=="svelte-qbrif4"&&(Q.innerHTML=Ol),Se=a(e),d(N.$$.fragment,e),ze=a(e),d(F.$$.fragment,e),Ae=a(e),Y=r(e,"P",{"data-svelte-h":!0}),c(Y)!=="svelte-1pe18dl"&&(Y.innerHTML=et),qe=a(e),S=r(e,"P",{"data-svelte-h":!0}),c(S)!=="svelte-g36iym"&&(S.innerHTML=lt),Pe=a(e),d(z.$$.fragment,e),De=a(e),A=r(e,"P",{"data-svelte-h":!0}),c(A)!=="svelte-1wsgro0"&&(A.innerHTML=tt),Ke=a(e),v=r(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),c(v)!=="svelte-1fkr80r"&&(v.innerHTML=st),Oe=a(e),q=r(e,"P",{"data-svelte-h":!0}),c(q)!=="svelte-9ijocs"&&(q.innerHTML=nt),el=a(e),d(P.$$.fragment,e),ll=a(e),D=r(e,"P",{"data-svelte-h":!0}),c(D)!=="svelte-15it70t"&&(D.textContent=at),tl=a(e),d(K.$$.fragment,e),sl=a(e),k=r(e,"BLOCKQUOTE",{class:!0,"data-svelte-
h":!0}),c(k)!=="svelte-16e1eoe"&&(k.innerHTML=pt),nl=a(e),O=r(e,"P",{"data-svelte-h":!0}),c(O)!=="svelte-1ahxi03"&&(O.innerHTML=it),al=a(e),ee=r(e,"P",{"data-svelte-h":!0}),c(ee)!=="svelte-169dym5"&&(ee.innerHTML=ot),pl=a(e),d(le.$$.fragment,e),il=a(e),te=r(e,"P",{"data-svelte-h":!0}),c(te)!=="svelte-jmoihs"&&(te.innerHTML=rt),ol=a(e),se=r(e,"P",{"data-svelte-h":!0}),c(se)!=="svelte-w2qqjo"&&(se.innerHTML=ft),rl=a(e),ne=r(e,"P",{"data-svelte-h":!0}),c(ne)!=="svelte-dsv3x4"&&(ne.textContent=ct),fl=a(e),d(ae.$$.fragment,e),cl=a(e),pe=r(e,"P",{"data-svelte-h":!0}),c(pe)!=="svelte-1nyxskn"&&(pe.innerHTML=mt),ml=a(e),ie=r(e,"P",{"data-svelte-h":!0}),c(ie)!=="svelte-1qqd039"&&(ie.innerHTML=ut),ul=a(e),d(oe.$$.fragment,e),hl=a(e),re=r(e,"P",{"data-svelte-h":!0}),c(re)!=="svelte-wmi0r6"&&(re.innerHTML=ht),dl=a(e),d(fe.$$.fragment,e),Ml=a(e),X=r(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),c(X)!=="svelte-1tna39"&&(X.innerHTML=dt),bl=a(e),ce=r(e,"P",{"data-svelte-h":!0}),c(ce)!=="svelte-6v6dgf"&&(ce.innerHTML=Mt),Tl=a(e),d(me.$$.fragment,e),wl=a(e),ue=r(e,"P",{"data-svelte-h":!0}),c(ue)!=="svelte-zlqqst"&&(ue.innerHTML=bt),yl=a(e),d(he.$$.fragment,e),Jl=a(e),de=r(e,"P",{"data-svelte-h":!0}),c(de)!=="svelte-qa92su"&&(de.innerHTML=Tt),jl=a(e),d(Me.$$.fragment,e),$l=a(e),d(be.$$.fragment,e),Ul=a(e),Te=r(e,"P",{"data-svelte-h":!0}),c(Te)!=="svelte-jzcid"&&(Te.innerHTML=wt),Zl=a(e),we=r(e,"P",{"data-svelte-h":!0}),c(we)!=="svelte-11fadgl"&&(we.innerHTML=yt),Gl=a(e),d(R.$$.fragment,e),gl=a(e),d(ye.$$.fragment,e),_l=a(e),Je=r(e,"P",{"data-svelte-h":!0}),c(Je)!=="svelte-h31nps"&&(Je.innerHTML=Jt),Wl=a(e),je=r(e,"P",{"data-svelte-h":!0}),c(je)!=="svelte-oh3qub"&&(je.innerHTML=jt),Bl=a(e),d($e.$$.fragment,e),xl=a(e),Ue=r(e,"P",{"data-svelte-h":!0}),c(Ue)!=="svelte-1esyvy8"&&(Ue.innerHTML=$t),Cl=a(e),Ze=r(e,"P",{"data-svelte-h":!0}),c(Ze)!=="svelte-10dayln"&&(Ze.innerHTML=Ut),vl=a(e),I=r(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),c(I)!=="svelte-1xrzhvm"&&(I.innerHTML=Zt),kl=a(e),Ge=
r(e,"P",{"data-svelte-h":!0}),c(Ge)!=="svelte-19bhy1d"&&(Ge.textContent=Gt),Xl=a(e),d(ge.$$.fragment,e),Rl=a(e),_e=r(e,"P",{"data-svelte-h":!0}),c(_e)!=="svelte-17j89ie"&&(_e.innerHTML=gt),Il=a(e),d(We.$$.fragment,e),Vl=a(e),d(Be.$$.fragment,e),Ll=a(e),xe=r(e,"BLOCKQUOTE",{"data-svelte-h":!0}),c(xe)!=="svelte-tty6r3"&&(xe.innerHTML=_t),Hl=a(e),Ce=r(e,"P",{"data-svelte-h":!0}),c(Ce)!=="svelte-qv5ldk"&&(Ce.textContent=Wt),El=a(e),d(ve.$$.fragment,e),Ql=a(e),d(ke.$$.fragment,e),Nl=a(e),Xe=r(e,"UL",{"data-svelte-h":!0}),c(Xe)!=="svelte-58n17n"&&(Xe.innerHTML=Bt),Fl=a(e),d(Re.$$.fragment,e),Yl=a(e),Ve=r(e,"P",{}),vt(Ve).forEach(t),this.h()},h(){Ie(i,"name","hf:doc:metadata"),Ie(i,"content",qt),Ie(C,"class","tip"),Ie(v,"class","tip"),Ie(k,"class","tip"),Ie(X,"class","tip"),Ie(I,"class","tip")},m(e,l){Lt(document.head,i),s(e,j,l),s(e,m,l),s(e,u,l),M(y,e,l),s(e,f,l),s(e,p,l),s(e,U,l),s(e,g,l),s(e,J,l),s(e,Z,l),s(e,Le,l),M(V,e,l),s(e,He,l),s(e,L,l),s(e,Ee,l),M(x,e,l),s(e,Qe,l),M(H,e,l),s(e,Ne,l),s(e,C,l),s(e,Fe,l),s(e,E,l),s(e,Ye,l),s(e,Q,l),s(e,Se,l),M(N,e,l),s(e,ze,l),M(F,e,l),s(e,Ae,l),s(e,Y,l),s(e,qe,l),s(e,S,l),s(e,Pe,l),M(z,e,l),s(e,De,l),s(e,A,l),s(e,Ke,l),s(e,v,l),s(e,Oe,l),s(e,q,l),s(e,el,l),M(P,e,l),s(e,ll,l),s(e,D,l),s(e,tl,l),M(K,e,l),s(e,sl,l),s(e,k,l),s(e,nl,l),s(e,O,l),s(e,al,l),s(e,ee,l),s(e,pl,l),M(le,e,l),s(e,il,l),s(e,te,l),s(e,ol,l),s(e,se,l),s(e,rl,l),s(e,ne,l),s(e,fl,l),M(ae,e,l),s(e,cl,l),s(e,pe,l),s(e,ml,l),s(e,ie,l),s(e,ul,l),M(oe,e,l),s(e,hl,l),s(e,re,l),s(e,dl,l),M(fe,e,l),s(e,Ml,l),s(e,X,l),s(e,bl,l),s(e,ce,l),s(e,Tl,l),M(me,e,l),s(e,wl,l),s(e,ue,l),s(e,yl,l),M(he,e,l),s(e,Jl,l),s(e,de,l),s(e,jl,l),M(Me,e,l),s(e,$l,l),M(be,e,l),s(e,Ul,l),s(e,Te,l),s(e,Zl,l),s(e,we,l),s(e,Gl,l),M(R,e,l),s(e,gl,l),M(ye,e,l),s(e,_l,l),s(e,Je,l),s(e,Wl,l),s(e,je,l),s(e,Bl,l),M($e,e,l),s(e,xl,l),s(e,Ue,l),s(e,Cl,l),s(e,Ze,l),s(e,vl,l),s(e,I,l),s(e,kl,l),s(e,Ge,l),s(e,Xl,l),M(ge,e,l),s(e,Rl,l),s(e,_e,l),s(e,Il,l),M(We,e,l),s(e,Vl,l),M(Be,e,l),s(e,Ll,l),s(e,xe,l),s(e,Hl,
l),s(e,Ce,l),s(e,El,l),M(ve,e,l),s(e,Ql,l),M(ke,e,l),s(e,Nl,l),s(e,Xe,l),s(e,Fl,l),M(Re,e,l),s(e,Yl,l),s(e,Ve,l),Sl=!0},p(e,[l]){const xt={};l&2&&(xt.$$scope={dirty:l,ctx:e}),x.$set(xt);const Ct={};l&2&&(Ct.$$scope={dirty:l,ctx:e}),R.$set(Ct)},i(e){Sl||(b(y.$$.fragment,e),b(V.$$.fragment,e),b(x.$$.fragment,e),b(H.$$.fragment,e),b(N.$$.fragment,e),b(F.$$.fragment,e),b(z.$$.fragment,e),b(P.$$.fragment,e),b(K.$$.fragment,e),b(le.$$.fragment,e),b(ae.$$.fragment,e),b(oe.$$.fragment,e),b(fe.$$.fragment,e),b(me.$$.fragment,e),b(he.$$.fragment,e),b(Me.$$.fragment,e),b(be.$$.fragment,e),b(R.$$.fragment,e),b(ye.$$.fragment,e),b($e.$$.fragment,e),b(ge.$$.fragment,e),b(We.$$.fragment,e),b(Be.$$.fragment,e),b(ve.$$.fragment,e),b(ke.$$.fragment,e),b(Re.$$.fragment,e),Sl=!0)},o(e){T(y.$$.fragment,e),T(V.$$.fragment,e),T(x.$$.fragment,e),T(H.$$.fragment,e),T(N.$$.fragment,e),T(F.$$.fragment,e),T(z.$$.fragment,e),T(P.$$.fragment,e),T(K.$$.fragment,e),T(le.$$.fragment,e),T(ae.$$.fragment,e),T(oe.$$.fragment,e),T(fe.$$.fragment,e),T(me.$$.fragment,e),T(he.$$.fragment,e),T(Me.$$.fragment,e),T(be.$$.fragment,e),T(R.$$.fragment,e),T(ye.$$.fragment,e),T($e.$$.fragment,e),T(ge.$$.fragment,e),T(We.$$.fragment,e),T(Be.$$.fragment,e),T(ve.$$.fragment,e),T(ke.$$.fragment,e),T(Re.$$.fragment,e),Sl=!1},d(e){e&&(t(j),t(m),t(u),t(f),t(p),t(U),t(g),t(J),t(Z),t(Le),t(He),t(L),t(Ee),t(Qe),t(Ne),t(C),t(Fe),t(E),t(Ye),t(Q),t(Se),t(ze),t(Ae),t(Y),t(qe),t(S),t(Pe),t(De),t(A),t(Ke),t(v),t(Oe),t(q),t(el),t(ll),t(D),t(tl),t(sl),t(k),t(nl),t(O),t(al),t(ee),t(pl),t(il),t(te),t(ol),t(se),t(rl),t(ne),t(fl),t(cl),t(pe),t(ml),t(ie),t(ul),t(hl),t(re),t(dl),t(Ml),t(X),t(bl),t(ce),t(Tl),t(wl),t(ue),t(yl),t(Jl),t(de),t(jl),t($l),t(Ul),t(Te),t(Zl),t(we),t(Gl),t(gl),t(_l),t(Je),t(Wl),t(je),t(Bl),t(xl),t(Ue),t(Cl),t(Ze),t(vl),t(I),t(kl),t(Ge),t(Xl),t(Rl),t(_e),t(Il),t(Vl),t(Ll),t(xe),t(Hl),t(Ce),t(El),t(Ql),t(Nl),t(Xe),t(Fl),t(Yl),t(Ve)),t(i),w(y,e),w(V,e),w(x,e),w(H,e),w(N,e),w(F,e),w(z,e),w(P,e),w(K,e),w(le,e),w(ae,e)
,w(oe,e),w(fe,e),w(me,e),w(he,e),w(Me,e),w(be,e),w(R,e),w(ye,e),w($e,e),w(ge,e),w(We,e),w(Be,e),w(ve,e),w(ke,e),w(Re,e)}}}const qt='{"title":"加速推理","local":"加速推理","sections":[{"title":"模型数据类型","local":"模型数据类型","sections":[],"depth":2},{"title":"缩放点积注意力","local":"缩放点积注意力","sections":[],"depth":2},{"title":"torch.compile","local":"torchcompile","sections":[{"title":"动态形状编译","local":"动态形状编译","sections":[],"depth":3},{"title":"区域编译","local":"区域编译","sections":[],"depth":3},{"title":"图中断","local":"图中断","sections":[],"depth":3},{"title":"GPU同步","local":"gpu同步","sections":[],"depth":3},{"title":"基准测试","local":"基准测试","sections":[],"depth":3}],"depth":2},{"title":"动态量化","local":"动态量化","sections":[],"depth":2},{"title":"融合投影矩阵","local":"融合投影矩阵","sections":[],"depth":2},{"title":"资源","local":"资源","sections":[],"depth":2}],"depth":1}';function Pt(_){return Xt(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class ss extends Rt{constructor(i){super(),It(this,i,Pt,At,kt,{})}}export{ss as component};

Xet Storage Details

Size:
42.9 kB
·
Xet hash:
2bd1acc50e50ed81a210a4c81b1224870c4a93cb7be5f62049ff8bb68d492811

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.