Buckets:

rtrm's picture
download
raw
43 kB
import{s as Rt,o as It,n as Pl}from"../chunks/scheduler.e4ff9b64.js";import{S as Vt,i as Lt,e as i,s as n,c as u,h as Ht,a as o,d as t,b as a,f as Xt,g as d,j as f,k as V,l as Et,m as s,n as h,t as M,o as b,p as T}from"../chunks/index.09f1bca0.js";import{C as Qt,H as g,E as Nt}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.ef7e5b9e.js";import{C as Z}from"../chunks/CodeBlock.0adc48e9.js";import{H as Ft,a as ql}from"../chunks/HfOption.44827c7f.js";function Yt(_){let c,U="bfloat16与float16类似,但对数值误差更稳健。硬件对bfloat16的支持各不相同,但大多数现代GPU都能支持bfloat16。",y,m,J;return m=new Z({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXBpcGVsaW5lJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyc3RhYmlsaXR5YWklMkZzdGFibGUtZGlmZnVzaW9uLXhsLWJhc2UtMS4wJTIyJTJDJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5iZmxvYXQxNiUwQSkudG8oJTIyY3VkYSUyMiklMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJBc3Ryb25hdXQlMjBpbiUyMGElMjBqdW5nbGUlMkMlMjBjb2xkJTIwY29sb3IlMjBwYWxldHRlJTJDJTIwbXV0ZWQlMjBjb2xvcnMlMkMlMjBkZXRhaWxlZCUyQyUyMDhrJTIyJTBBcGlwZWxpbmUocHJvbXB0JTJDJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDMwKS5pbWFnZXMlNUIwJTVE",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),{c(){c=i("p"),c.textContent=U,y=n(),u(m.$$.fragment)},l(r){c=o(r,"P",{"data-svelte-h":!0}),f(c)!=="svelte-1u2chbj"&&(c.textContent=U),y=a(r),d(m.$$.fragment,r)},m(r,p){s(r,c,p),s(r,y,p),h(m,r,p),J=!0},p:Pl,i(r){J||(M(m.$$.fragment,r),J=!0)},o(r){b(m.$$.fragment,r),J=!1},d(r){r&&(t(c),t(y)),T(m,r)}}}function St(_){let c,U="float16与bfloat16类似,但可能更容易出现数值误差。",y,m,J;return m=new Z({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXBpcGVsaW5lJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyc3RhYmlsaXR5YWklMkZzdGFibGUtZGlmZnVzaW9uLXhsLWJhc2UtMS4wJTIyJTJDJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5mbG9hdDE2JTBBKS50byglMjJjdWRhJTIyKSUwQSUwQXByb21wdCUyMCUzRCUyMCUyMkFzdHJvbmF1dCUyMGluJTIwYSUyMGp1bmdsZSUyQyUyMGNvbGQlMjBjb2xvciUyMHBhbGV0dGUlMkMlMjBtdXRlZCUyMGNvbG9ycyUyQyUyMGRldGFpbGVkJTJDJTIwOGslMjIlMEFwaXBlbGluZShwcm9tcHQlMkMlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNEMzApLmltYWdlcyU1QjAlNUQ=",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.float16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),{c(){c=i("p"),c.textContent=U,y=n(),u(m.$$.fragment)},l(r){c=o(r,"P",{"data-svelte-h":!0}),f(c)!=="svelte-1nw1ct0"&&(c.textContent=U),y=a(r),d(m.$$.fragment,r)},m(r,p){s(r,c,p),s(r,y,p),h(m,r,p),J=!0},p:Pl,i(r){J||(M(m.$$.fragment,r),J=!0)},o(r){b(m.$$.fragment,r),J=!1},d(r){r&&(t(c),t(y)),T(m,r)}}}function zt(_){let c,U='<a href="https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/" rel="nofollow">TensorFloat-32 (tf32)</a>模式在NVIDIA Ampere GPU上受支持,它以tf32计算卷积和矩阵乘法运算。存储和其他操作保持在float32。与bfloat16或float16结合使用时,可以显著加快计算速度。',y,m,J="PyTorch默认仅对卷积启用tf32模式,您需要显式启用矩阵乘法的tf32模式。",r,p,j,$,x='更多详情请参阅<a href="https://huggingface.co/docs/transformers/en/perf_train_gpu_one#mixed-precision" rel="nofollow">混合精度训练</a>文档。',G;return p=new Z({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXRvcmNoLmJhY2tlbmRzLmN1ZGEubWF0bXVsLmFsbG93X3RmMzIlMjAlM0QlMjBUcnVlJTBBJTBBcGlwZWxpbmUlMjAlM0QlMjBTdGFibGVEaWZmdXNpb25YTFBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJzdGFiaWxpdHlhaSUyRnN0YWJsZS1kaWZmdXNpb24teGwtYmFzZS0xLjAlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmJmbG9hdDE2JTBBKS50byglMjJjdWRhJTIyKSUwQSUwQXByb21wdCUyMCUzRCUyMCUyMkFzdHJvbmF1dCUyMGluJTIwYSUyMGp1bmdsZSUyQyUyMGNvbGQlMjBjb2xvciUyMHBhbGV0dGUlMkMlMjBtdXRlZCUyMGNvbG9ycyUyQyUyMGRldGFpbGVkJTJDJTIwOGslMjIlMEFwaXBlbGluZShwcm9tcHQlMkMlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNEMzApLmltYWdlcyU1QjAlNUQ=",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
torch.backends.cuda.matmul.allow_tf32 = <span class="hljs-literal">True</span>
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),{c(){c=i("p"),c.innerHTML=U,y=n(),m=i("p"),m.textContent=J,r=n(),u(p.$$.fragment),j=n(),$=i("p"),$.innerHTML=x},l(w){c=o(w,"P",{"data-svelte-h":!0}),f(c)!=="svelte-o6hhje"&&(c.innerHTML=U),y=a(w),m=o(w,"P",{"data-svelte-h":!0}),f(m)!=="svelte-1p6w426"&&(m.textContent=J),r=a(w),d(p.$$.fragment,w),j=a(w),$=o(w,"P",{"data-svelte-h":!0}),f($)!=="svelte-1v2g6n5"&&($.innerHTML=x)},m(w,W){s(w,c,W),s(w,y,W),s(w,m,W),s(w,r,W),h(p,w,W),s(w,j,W),s(w,$,W),G=!0},p:Pl,i(w){G||(M(p.$$.fragment,w),G=!0)},o(w){b(p.$$.fragment,w),G=!1},d(w){w&&(t(c),t(y),t(m),t(r),t(j),t($)),T(p,w)}}}function At(_){let c,U,y,m,J,r;return c=new ql({props:{id:"dtypes",option:"bfloat16",$$slots:{default:[Yt]},$$scope:{ctx:_}}}),y=new ql({props:{id:"dtypes",option:"float16",$$slots:{default:[St]},$$scope:{ctx:_}}}),J=new ql({props:{id:"dtypes",option:"TensorFloat-32",$$slots:{default:[zt]},$$scope:{ctx:_}}}),{c(){u(c.$$.fragment),U=n(),u(y.$$.fragment),m=n(),u(J.$$.fragment)},l(p){d(c.$$.fragment,p),U=a(p),d(y.$$.fragment,p),m=a(p),d(J.$$.fragment,p)},m(p,j){h(c,p,j),s(p,U,j),h(y,p,j),s(p,m,j),h(J,p,j),r=!0},p(p,j){const $={};j&2&&($.$$scope={dirty:j,ctx:p}),c.$set($);const x={};j&2&&(x.$$scope={dirty:j,ctx:p}),y.$set(x);const G={};j&2&&(G.$$scope={dirty:j,ctx:p}),J.$set(G)},i(p){r||(M(c.$$.fragment,p),M(y.$$.fragment,p),M(J.$$.fragment,p),r=!0)},o(p){b(c.$$.fragment,p),b(y.$$.fragment,p),b(J.$$.fragment,p),r=!1},d(p){p&&(t(U),t(m)),T(c,p),T(y,p),T(J,p)}}}function qt(_){let c,U,y,m,J,r,p,j,$,x='Diffusion模型在推理时速度较慢,因为生成是一个迭代过程,需要经过一定数量的”步数”逐步将噪声细化为图像或视频。要加速这一过程,您可以尝试使用不同的<a href="../api/schedulers/overview">调度器</a>、降低模型权重的精度以加快计算、使用更高效的内存注意力机制等方法。',G,w,W="将这些技术组合使用,可以比单独使用任何一种技术获得更快的推理速度。",He,L,Dl="本指南将介绍如何加速推理。",Ee,H,Qe,E,Ol="模型权重的精度和数据类型会影响推理速度,因为更高的精度需要更多内存来加载,也需要更多时间进行计算。PyTorch默认以float32或全精度加载模型权重,因此更改数据类型是快速获得更快推理速度的简单方法。",Ne,B,Fe,Q,Ye,C,Kl='<p>内存高效注意力优化了推理速度<em>和</em><a 
href="./memory#memory-efficient-attention">内存使用</a>!</p>',Se,N,et='<a href="https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html" rel="nofollow">缩放点积注意力(SDPA)</a>实现了多种注意力后端,包括<a href="https://github.com/Dao-AILab/flash-attention" rel="nofollow">FlashAttention</a>、<a href="https://github.com/facebookresearch/xformers" rel="nofollow">xFormers</a>和原生C++实现。它会根据您的硬件自动选择最优的后端。',ze,F,lt='如果您使用的是PyTorch &gt;= 2.0,SDPA默认启用,无需对代码进行任何额外更改。不过,您也可以尝试使用其他注意力后端来自行选择。下面的示例使用<a href="https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html" rel="nofollow">torch.nn.attention.sdpa_kernel</a>上下文管理器来启用高效注意力。',Ae,Y,qe,S,Pe,z,tt='<a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html" rel="nofollow">torch.compile</a>通过将PyTorch代码和操作编译为优化的内核来加速推理。Diffusers通常会编译计算密集型的模型,如UNet、transformer或VAE。',De,A,st='启用以下编译器设置以获得最大速度(更多选项请参阅<a href="https://github.com/pytorch/pytorch/blob/main/torch/_inductor/config.py" rel="nofollow">完整列表</a>)。',Oe,q,Ke,P,nt="加载并编译UNet和VAE。有几种不同的模式可供选择,但<code>&quot;max-autotune&quot;</code>通过编译为CUDA图来优化速度。CUDA图通过单个CPU操作启动多个GPU操作,有效减少了开销。",el,k,at='<p>在PyTorch 2.3.1中,您可以控制torch.compile的缓存行为。这对于像<code>&quot;max-autotune&quot;</code>这样的编译模式特别有用,它会通过网格搜索多个编译标志来找到最优配置。更多详情请参阅<a href="https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html" rel="nofollow">torch.compile中的编译时间缓存</a>教程。</p>',ll,D,pt='将内存布局更改为<a href="./memory#torchchannels_last">channels_last</a>也可以优化内存和推理速度。',tl,O,sl,K,it="第一次编译时速度较慢,但一旦编译完成,速度会显著提升。尽量只在相同类型的推理操作上使用编译后的管道。在不同尺寸的图像上调用编译后的管道会重新触发编译,这会很慢且效率低下。",nl,ee,al,v,ot="<p>确保始终使用PyTorch的nightly版本以获得更好的支持。</p>",pl,le,rt="<code>torch.compile</code>会跟踪输入形状和条件,如果这些不同,它会重新编译模型。例如,如果模型是在1024x1024分辨率的图像上编译的,而在不同分辨率的图像上使用,就会触发重新编译。",il,te,ft="为避免重新编译,添加<code>dynamic=True</code>以尝试生成更动态的内核,避免条件变化时重新编译。",ol,se,rl,ne,ct='指定<code>use_duck_shape=False</code>会指示编译器是否应使用相同的符号变量来表示相同大小的输入。更多详情请参阅此<a 
href="https://github.com/huggingface/diffusers/pull/11327#discussion_r2047659790" rel="nofollow">评论</a>。',fl,ae,mt='并非所有模型都能开箱即用地从动态编译中受益,可能需要更改。参考此<a href="https://github.com/huggingface/diffusers/pull/11297/" rel="nofollow">PR</a>,它改进了<code>AuraFlowPipeline</code>的实现以受益于动态编译。',cl,pe,ut="如果动态编译对Diffusers模型的效果不如预期,请随时提出问题。",ml,ie,ul,oe,dt='<a href="https://docs.pytorch.org/tutorials/recipes/regional_compilation.html" rel="nofollow">区域编译</a>通过仅编译模型中<em>小而频繁重复的块</em>(通常是transformer层)来减少冷启动延迟,并为每个后续出现的块重用编译后的工件。对于许多diffusion架构,这提供了与全图编译相同的运行时加速,并将编译时间减少了8-10倍。',dl,re,ht="使用<code>compile_repeated_blocks()</code>方法(一个包装<code>torch.compile</code>的辅助函数)在任何组件(如transformer模型)上,如下所示。",hl,fe,Ml,ce,Mt="要为新模型启用区域编译,请在模型类中添加一个<code>_repeated_blocks</code>属性,包含您想要编译的块的类名(作为字符串)。",bl,me,Tl,X,bt='<p>更多区域编译示例,请参阅参考<a href="https://github.com/huggingface/diffusers/pull/11705" rel="nofollow">PR</a>。</p>',wl,ue,Tt='<a href="https://huggingface.co/docs/accelerate/index" rel="nofollow">Accelerate</a>中还有一个<a href="https://github.com/huggingface/accelerate/blob/273799c85d849a1954a4f2e65767216eb37fa089/src/accelerate/utils/other.py#L78" rel="nofollow">compile_regions</a>方法,可以自动选择模型中的候选块进行编译。其余图会单独编译。这对于快速实验很有用,因为您不需要设置哪些块要编译或调整编译标志。',yl,de,Jl,he,wt="<code>compile_repeated_blocks()</code>是故意显式的。在<code>_repeated_blocks</code>中列出要重复的块,辅助函数仅编译这些块。它提供了可预测的行为,并且只需一行代码即可轻松推理缓存重用。",jl,Me,Ul,be,yt="在torch.compile中指定<code>fullgraph=True</code>非常重要,以确保底层模型中没有图中断。这使您可以充分利用torch.compile而不会降低性能。对于UNet和VAE,这会改变您访问返回变量的方式。",$l,Te,Zl,we,Gl,ye,Jt='每次去噪器做出预测后,调度器的<code>step()</code>函数会被<a href="https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L1228" rel="nofollow">调用</a>,并且<code>sigmas</code>变量会被<a href="https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/schedulers/scheduling_euler_discrete.py#L476" 
rel="nofollow">索引</a>。当放在GPU上时,这会引入延迟,因为CPU和GPU之间需要进行通信同步。当去噪器已经编译时,这一点会更加明显。',gl,Je,jt='一般来说,<code>sigmas</code>应该<a href="https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/schedulers/scheduling_euler_discrete.py#L240" rel="nofollow">保持在CPU上</a>,以避免通信同步和延迟。',_l,R,Ut='<p>参阅<a href="https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/" rel="nofollow">torch.compile和Diffusers:峰值性能实践指南</a>博客文章,了解如何为扩散模型最大化<code>torch.compile</code>的性能。</p>',Wl,je,xl,Ue,$t='参阅<a href="https://huggingface.co/datasets/diffusers/benchmarks" rel="nofollow">diffusers/benchmarks</a>数据集,查看编译管道的推理延迟和内存使用数据。',Bl,$e,Zt='<a href="https://github.com/sayakpaul/diffusers-torchao#benchmarking-results" rel="nofollow">diffusers-torchao</a>仓库还包含Flux和CogVideoX编译版本的基准测试结果。',Cl,Ze,kl,Ge,Gt='<a href="https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html" rel="nofollow">动态量化</a>通过降低精度以加快数学运算来提高推理速度。这种特定类型的量化在运行时根据数据确定如何缩放激活,而不是使用固定的缩放因子。因此,缩放因子与数据更准确地匹配。',vl,ge,gt='以下示例使用<a href="../quantization/torchao">torchao</a>库对UNet和VAE应用<a href="https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html" rel="nofollow">动态int8量化</a>。',Xl,I,_t='<p>参阅我们的<a href="../quantization/torchao">torchao</a>文档,了解更多关于如何使用Diffusers torchao集成的信息。</p>',Rl,_e,Wt="配置编译器标志以获得最大速度。",Il,We,Vl,xe,xt='使用<a href="https://github.com/huggingface/diffusion-fast/blob/0f169640b1db106fe6a479f78c1ed3bfaeba3386/utils/pipeline_utils.py#L16" rel="nofollow">dynamic_quant_filter_fn</a>过滤掉UNet和VAE中一些不会从动态量化中受益的线性层。',Ll,Be,Hl,Ce,El,ke,Bt='<p>[!WARNING][fuse_qkv_projections](<a href="https://github.com/huggingface/diffusers/blob/58431f102cf39c3c8a569f32d71b2ea8caa461e1/src/diffusers/pipelines/pipeline_utils.py#L2034)%E6%96%B9%E6%B3%95%E6%98%AF%E5%AE%9E%E9%AA%8C%E6%80%A7%E7%9A%84%EF%BC%8C%E7%9B%AE%E5%89%8D%E4%B8%BB%E8%A6%81%E6%94%AF%E6%8C%81Stable" 
rel="nofollow">https://github.com/huggingface/diffusers/blob/58431f102cf39c3c8a569f32d71b2ea8caa461e1/src/diffusers/pipelines/pipeline_utils.py#L2034)方法是实验性的,目前主要支持Stable</a> Diffusion管道。参阅此<a href="https://github.com/huggingface/diffusers/pull/6179" rel="nofollow">PR</a>了解如何为其他管道启用它。</p>',Ql,ve,Ct="在注意力块中,输入被投影到三个子空间,分别由投影矩阵Q、K和V表示。这些投影通常单独计算,但您可以水平组合这些矩阵为一个矩阵,并在单步中执行投影。这会增加输入投影的矩阵乘法大小,并提高量化的效果。",Nl,Xe,Fl,Re,Yl,Ie,kt='<li><p>阅读<a href="https://pytorch.org/blog/presenting-flux-fast-making-flux-go-brrr-on-h100s/" rel="nofollow">Presenting Flux Fast: Making Flux go brrr on H100s</a>博客文章,了解如何结合所有这些优化与<a href="https://docs.pytorch.org/docs/stable/torch.compiler.html" rel="nofollow">TorchInductor</a>和<a href="https://docs.pytorch.org/docs/stable/torch.compiler_aot_inductor.html" rel="nofollow">AOTInductor</a>,使用<a href="https://github.com/huggingface/flux-fast" rel="nofollow">flux-fast</a>的配方获得约2.5倍的加速。</p> <p>这些配方支持AMD硬件和<a href="https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev" rel="nofollow">Flux.1 Kontext Dev</a>。</p></li> <li><p>阅读<a href="https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/" rel="nofollow">torch.compile和Diffusers:峰值性能实践指南</a>博客文章,了解如何在使用<code>torch.compile</code>时最大化性能。</p></li>',Sl,Ve,zl,Le,Al;return J=new Qt({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),p=new g({props:{title:"加速推理",local:"加速推理",headingTag:"h1"}}),H=new g({props:{title:"模型数据类型",local:"模型数据类型",headingTag:"h2"}}),B=new Ft({props:{id:"dtypes",options:["bfloat16","float16","TensorFloat-32"],$$slots:{default:[At]},$$scope:{ctx:_}}}),Q=new g({props:{title:"缩放点积注意力",local:"缩放点积注意力",headingTag:"h2"}}),Y=new 
Z({props:{code:"ZnJvbSUyMHRvcmNoLm5uLmF0dGVudGlvbiUyMGltcG9ydCUyMFNEUEJhY2tlbmQlMkMlMjBzZHBhX2tlcm5lbCUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMFN0YWJsZURpZmZ1c2lvblhMUGlwZWxpbmUlMEElMEFwaXBlbGluZSUyMCUzRCUyMFN0YWJsZURpZmZ1c2lvblhMUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMnN0YWJpbGl0eWFpJTJGc3RhYmxlLWRpZmZ1c2lvbi14bC1iYXNlLTEuMCUyMiUyQyUyMHRvcmNoX2R0eXBlJTNEdG9yY2guYmZsb2F0MTYlMEEpLnRvKCUyMmN1ZGElMjIpJTBBJTBBcHJvbXB0JTIwJTNEJTIwJTIyQXN0cm9uYXV0JTIwaW4lMjBhJTIwanVuZ2xlJTJDJTIwY29sZCUyMGNvbG9yJTIwcGFsZXR0ZSUyQyUyMG11dGVkJTIwY29sb3JzJTJDJTIwZGV0YWlsZWQlMkMlMjA4ayUyMiUwQSUwQXdpdGglMjBzZHBhX2tlcm5lbChTRFBCYWNrZW5kLkVGRklDSUVOVF9BVFRFTlRJT04pJTNBJTBBJTIwJTIwaW1hZ2UlMjAlM0QlMjBwaXBlbGluZShwcm9tcHQlMkMlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNEMzApLmltYWdlcyU1QjAlNUQ=",highlighted:`<span class="hljs-keyword">from</span> torch.nn.attention <span class="hljs-keyword">import</span> SDPBackend, sdpa_kernel
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
<span class="hljs-keyword">with</span> sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
image = pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),S=new g({props:{title:"torch.compile",local:"torchcompile",headingTag:"h2"}}),q=new Z({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcuY29udl8xeDFfYXNfbW0lMjAlM0QlMjBUcnVlJTBBdG9yY2guX2luZHVjdG9yLmNvbmZpZy5jb29yZGluYXRlX2Rlc2NlbnRfdHVuaW5nJTIwJTNEJTIwVHJ1ZSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcuZXBpbG9ndWVfZnVzaW9uJTIwJTNEJTIwRmFsc2UlMEF0b3JjaC5faW5kdWN0b3IuY29uZmlnLmNvb3JkaW5hdGVfZGVzY2VudF9jaGVja19hbGxfZGlyZWN0aW9ucyUyMCUzRCUyMFRydWU=",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
torch._inductor.config.conv_1x1_as_mm = <span class="hljs-literal">True</span>
torch._inductor.config.coordinate_descent_tuning = <span class="hljs-literal">True</span>
torch._inductor.config.epilogue_fusion = <span class="hljs-literal">False</span>
torch._inductor.config.coordinate_descent_check_all_directions = <span class="hljs-literal">True</span>`,wrap:!1}}),O=new Z({props:{code:"cGlwZWxpbmUlMjAlM0QlMjBTdGFibGVEaWZmdXNpb25YTFBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJzdGFiaWxpdHlhaSUyRnN0YWJsZS1kaWZmdXNpb24teGwtYmFzZS0xLjAlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmZsb2F0MTYlMEEpLnRvKCUyMmN1ZGElMjIpJTBBcGlwZWxpbmUudW5ldC50byhtZW1vcnlfZm9ybWF0JTNEdG9yY2guY2hhbm5lbHNfbGFzdCklMEFwaXBlbGluZS52YWUudG8obWVtb3J5X2Zvcm1hdCUzRHRvcmNoLmNoYW5uZWxzX2xhc3QpJTBBcGlwZWxpbmUudW5ldCUyMCUzRCUyMHRvcmNoLmNvbXBpbGUoJTBBJTIwJTIwJTIwJTIwcGlwZWxpbmUudW5ldCUyQyUyMG1vZGUlM0QlMjJtYXgtYXV0b3R1bmUlMjIlMkMlMjBmdWxsZ3JhcGglM0RUcnVlJTBBKSUwQXBpcGVsaW5lLnZhZS5kZWNvZGUlMjAlM0QlMjB0b3JjaC5jb21waWxlKCUwQSUyMCUyMCUyMCUyMHBpcGVsaW5lLnZhZS5kZWNvZGUlMkMlMEElMjAlMjAlMjAlMjBtb2RlJTNEJTIybWF4LWF1dG90dW5lJTIyJTJDJTBBJTIwJTIwJTIwJTIwZnVsbGdyYXBoJTNEVHJ1ZSUwQSklMEElMEFwcm9tcHQlMjAlM0QlMjAlMjJBc3Ryb25hdXQlMjBpbiUyMGElMjBqdW5nbGUlMkMlMjBjb2xkJTIwY29sb3IlMjBwYWxldHRlJTJDJTIwbXV0ZWQlMjBjb2xvcnMlMkMlMjBkZXRhaWxlZCUyQyUyMDhrJTIyJTBBcGlwZWxpbmUocHJvbXB0JTJDJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDMwKS5pbWFnZXMlNUIwJTVE",highlighted:`pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.float16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
pipeline.unet.to(memory_format=torch.channels_last)
pipeline.vae.to(memory_format=torch.channels_last)
pipeline.unet = torch.<span class="hljs-built_in">compile</span>(
pipeline.unet, mode=<span class="hljs-string">&quot;max-autotune&quot;</span>, fullgraph=<span class="hljs-literal">True</span>
)
pipeline.vae.decode = torch.<span class="hljs-built_in">compile</span>(
pipeline.vae.decode,
mode=<span class="hljs-string">&quot;max-autotune&quot;</span>,
fullgraph=<span class="hljs-literal">True</span>
)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),ee=new g({props:{title:"动态形状编译",local:"动态形状编译",headingTag:"h3"}}),se=new Z({props:{code:"JTJCJTIwdG9yY2guZnguZXhwZXJpbWVudGFsLl9jb25maWcudXNlX2R1Y2tfc2hhcGUlMjAlM0QlMjBGYWxzZSUwQSUyQiUyMHBpcGVsaW5lLnVuZXQlMjAlM0QlMjB0b3JjaC5jb21waWxlKCUwQSUyMCUyMCUyMCUyMHBpcGVsaW5lLnVuZXQlMkMlMjBmdWxsZ3JhcGglM0RUcnVlJTJDJTIwZHluYW1pYyUzRFRydWUlMEEp",highlighted:`<span class="hljs-addition">+ torch.fx.experimental._config.use_duck_shape = False</span>
<span class="hljs-addition">+ pipeline.unet = torch.compile(</span>
pipeline.unet, fullgraph=True, dynamic=True
)`,wrap:!1}}),ie=new g({props:{title:"区域编译",local:"区域编译",headingTag:"h3"}}),fe=new Z({props:{code:"JTIzJTIwcGlwJTIwaW5zdGFsbCUyMC1VJTIwZGlmZnVzZXJzJTBBaW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXBpcGVsaW5lJTIwJTNEJTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyc3RhYmlsaXR5YWklMkZzdGFibGUtZGlmZnVzaW9uLXhsLWJhc2UtMS4wJTIyJTJDJTBBJTIwJTIwJTIwJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5mbG9hdDE2JTJDJTBBKS50byglMjJjdWRhJTIyKSUwQSUwQSUyMyUyMCVFNCVCQiU4NSVFNyVCQyU5NiVFOCVBRiU5MVVOZXQlRTQlQjglQUQlRTklODclOEQlRTUlQTQlOEQlRTclOUElODR0cmFuc2Zvcm1lciVFNSVCMSU4MiUwQXBpcGVsaW5lLnVuZXQuY29tcGlsZV9yZXBlYXRlZF9ibG9ja3MoZnVsbGdyYXBoJTNEVHJ1ZSk=",highlighted:`<span class="hljs-comment"># pip install -U diffusers</span>
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>,
torch_dtype=torch.float16,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-comment"># 仅编译UNet中重复的transformer层</span>
pipeline.unet.compile_repeated_blocks(fullgraph=<span class="hljs-literal">True</span>)`,wrap:!1}}),me=new Z({props:{code:"Y2xhc3MlMjBNeVVOZXQoTW9kZWxNaXhpbiklM0ElMEElMjAlMjAlMjAlMjBfcmVwZWF0ZWRfYmxvY2tzJTIwJTNEJTIwKCUyMlRyYW5zZm9ybWVyMkRNb2RlbCUyMiUyQyklMjAlMjAlMjMlMjAlRTIlODYlOTAlMjAlRTklQkIlOTglRTglQUUlQTQlRTclQkMlOTYlRTglQUYlOTE=",highlighted:`<span class="hljs-keyword">class</span> <span class="hljs-title class_">MyUNet</span>(<span class="hljs-title class_ inherited__">ModelMixin</span>):
_repeated_blocks = (<span class="hljs-string">&quot;Transformer2DModel&quot;</span>,) <span class="hljs-comment"># ← 默认编译</span>`,wrap:!1}}),de=new Z({props:{code:"JTIzJTIwcGlwJTIwaW5zdGFsbCUyMC1VJTIwYWNjZWxlcmF0ZSUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMFN0YWJsZURpZmZ1c2lvblhMUGlwZWxpbmUlMEFmcm9tJTIwYWNjZWxlcmF0ZS51dGlscyUyMGltcG9ydCUyMGNvbXBpbGUlMjByZWdpb25zJTBBJTBBcGlwZWxpbmUlMjAlM0QlMjBTdGFibGVEaWZmdXNpb25YTFBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJzdGFiaWxpdHlhaSUyRnN0YWJsZS1kaWZmdXNpb24teGwtYmFzZS0xLjAlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmZsb2F0MTYlMEEpLnRvKCUyMmN1ZGElMjIpJTBBcGlwZWxpbmUudW5ldCUyMCUzRCUyMGNvbXBpbGVfcmVnaW9ucyhwaXBlbGluZS51bmV0JTJDJTIwbW9kZSUzRCUyMnJlZHVjZS1vdmVyaGVhZCUyMiUyQyUyMGZ1bGxncmFwaCUzRFRydWUp",highlighted:`<span class="hljs-comment"># pip install -U accelerate</span>
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
<span class="hljs-keyword">from</span> accelerate.utils <span class="hljs-keyword">import</span> <span class="hljs-built_in">compile</span> regions
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.float16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
pipeline.unet = compile_regions(pipeline.unet, mode=<span class="hljs-string">&quot;reduce-overhead&quot;</span>, fullgraph=<span class="hljs-literal">True</span>)`,wrap:!1}}),Me=new g({props:{title:"图中断",local:"图中断",headingTag:"h3"}}),Te=new Z({props:{code:"LSUyMGxhdGVudHMlMjAlM0QlMjB1bmV0KCUwQS0lMjAlMjAlMjBsYXRlbnRzJTJDJTIwdGltZXN0ZXAlM0R0aW1lc3RlcCUyQyUyMGVuY29kZXJfaGlkZGVuX3N0YXRlcyUzRHByb21wdF9lbWJlZHMlMEEtKS5zYW1wbGUlMEElMEElMkIlMjBsYXRlbnRzJTIwJTNEJTIwdW5ldCglMEElMkIlMjAlMjAlMjBsYXRlbnRzJTJDJTIwdGltZXN0ZXAlM0R0aW1lc3RlcCUyQyUyMGVuY29kZXJfaGlkZGVuX3N0YXRlcyUzRHByb21wdF9lbWJlZHMlMkMlMjByZXR1cm5fZGljdCUzREZhbHNlJTBBJTJCKSU1QjAlNUQ=",highlighted:`<span class="hljs-deletion">- latents = unet(</span>
<span class="hljs-deletion">- latents, timestep=timestep, encoder_hidden_states=prompt_embeds</span>
<span class="hljs-deletion">-).sample</span>
<span class="hljs-addition">+ latents = unet(</span>
<span class="hljs-addition">+ latents, timestep=timestep, encoder_hidden_states=prompt_embeds, return_dict=False</span>
<span class="hljs-addition">+)[0]</span>`,wrap:!1}}),we=new g({props:{title:"GPU同步",local:"gpu同步",headingTag:"h3"}}),je=new g({props:{title:"基准测试",local:"基准测试",headingTag:"h3"}}),Ze=new g({props:{title:"动态量化",local:"动态量化",headingTag:"h2"}}),We=new Z({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwdG9yY2hhbyUyMGltcG9ydCUyMGFwcGx5X2R5bmFtaWNfcXVhbnQlMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwU3RhYmxlRGlmZnVzaW9uWExQaXBlbGluZSUwQSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcuY29udl8xeDFfYXNfbW0lMjAlM0QlMjBUcnVlJTBBdG9yY2guX2luZHVjdG9yLmNvbmZpZy5jb29yZGluYXRlX2Rlc2NlbnRfdHVuaW5nJTIwJTNEJTIwVHJ1ZSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcuZXBpbG9ndWVfZnVzaW9uJTIwJTNEJTIwRmFsc2UlMEF0b3JjaC5faW5kdWN0b3IuY29uZmlnLmNvb3JkaW5hdGVfZGVzY2VudF9jaGVja19hbGxfZGlyZWN0aW9ucyUyMCUzRCUyMFRydWUlMEF0b3JjaC5faW5kdWN0b3IuY29uZmlnLmZvcmNlX2Z1c2VfaW50X21tX3dpdGhfbXVsJTIwJTNEJTIwVHJ1ZSUwQXRvcmNoLl9pbmR1Y3Rvci5jb25maWcudXNlX21peGVkX21tJTIwJTNEJTIwVHJ1ZQ==",highlighted:`<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> torchao <span class="hljs-keyword">import</span> apply_dynamic_quant
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
torch._inductor.config.conv_1x1_as_mm = <span class="hljs-literal">True</span>
torch._inductor.config.coordinate_descent_tuning = <span class="hljs-literal">True</span>
torch._inductor.config.epilogue_fusion = <span class="hljs-literal">False</span>
torch._inductor.config.coordinate_descent_check_all_directions = <span class="hljs-literal">True</span>
torch._inductor.config.force_fuse_int_mm_with_mul = <span class="hljs-literal">True</span>
torch._inductor.config.use_mixed_mm = <span class="hljs-literal">True</span>`,wrap:!1}}),Be=new Z({props:{code:"cGlwZWxpbmUlMjAlM0QlMjBTdGFibGVEaWZmdXNpb25YTFBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJzdGFiaWxpdHlhaSUyRnN0YWJsZS1kaWZmdXNpb24teGwtYmFzZS0xLjAlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmJmbG9hdDE2JTBBKS50byglMjJjdWRhJTIyKSUwQSUwQWFwcGx5X2R5bmFtaWNfcXVhbnQocGlwZWxpbmUudW5ldCUyQyUyMGR5bmFtaWNfcXVhbnRfZmlsdGVyX2ZuKSUwQWFwcGx5X2R5bmFtaWNfcXVhbnQocGlwZWxpbmUudmFlJTJDJTIwZHluYW1pY19xdWFudF9maWx0ZXJfZm4pJTBBJTBBcHJvbXB0JTIwJTNEJTIwJTIyQXN0cm9uYXV0JTIwaW4lMjBhJTIwanVuZ2xlJTJDJTIwY29sZCUyMGNvbG9yJTIwcGFsZXR0ZSUyQyUyMG11dGVkJTIwY29sb3JzJTJDJTIwZGV0YWlsZWQlMkMlMjA4ayUyMiUwQXBpcGVsaW5lKHByb21wdCUyQyUyMG51bV9pbmZlcmVuY2Vfc3RlcHMlM0QzMCkuaW1hZ2VzJTVCMCU1RA==",highlighted:`pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
apply_dynamic_quant(pipeline.unet, dynamic_quant_filter_fn)
apply_dynamic_quant(pipeline.vae, dynamic_quant_filter_fn)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]`,wrap:!1}}),Ce=new g({props:{title:"融合投影矩阵",local:"融合投影矩阵",headingTag:"h2"}}),Xe=new Z({props:{code:"cGlwZWxpbmUuZnVzZV9xa3ZfcHJvamVjdGlvbnMoKQ==",highlighted:"pipeline.fuse_qkv_projections()",wrap:!1}}),Re=new g({props:{title:"资源",local:"资源",headingTag:"h2"}}),Ve=new Nt({props:{source:"https://github.com/huggingface/diffusers/blob/main/docs/source/zh/optimization/fp16.md"}}),{c(){c=i("meta"),U=n(),y=i("p"),m=n(),u(J.$$.fragment),r=n(),u(p.$$.fragment),j=n(),$=i("p"),$.innerHTML=x,G=n(),w=i("p"),w.textContent=W,He=n(),L=i("p"),L.textContent=Dl,Ee=n(),u(H.$$.fragment),Qe=n(),E=i("p"),E.textContent=Ol,Ne=n(),u(B.$$.fragment),Fe=n(),u(Q.$$.fragment),Ye=n(),C=i("blockquote"),C.innerHTML=Kl,Se=n(),N=i("p"),N.innerHTML=et,ze=n(),F=i("p"),F.innerHTML=lt,Ae=n(),u(Y.$$.fragment),qe=n(),u(S.$$.fragment),Pe=n(),z=i("p"),z.innerHTML=tt,De=n(),A=i("p"),A.innerHTML=st,Oe=n(),u(q.$$.fragment),Ke=n(),P=i("p"),P.innerHTML=nt,el=n(),k=i("blockquote"),k.innerHTML=at,ll=n(),D=i("p"),D.innerHTML=pt,tl=n(),u(O.$$.fragment),sl=n(),K=i("p"),K.textContent=it,nl=n(),u(ee.$$.fragment),al=n(),v=i("blockquote"),v.innerHTML=ot,pl=n(),le=i("p"),le.innerHTML=rt,il=n(),te=i("p"),te.innerHTML=ft,ol=n(),u(se.$$.fragment),rl=n(),ne=i("p"),ne.innerHTML=ct,fl=n(),ae=i("p"),ae.innerHTML=mt,cl=n(),pe=i("p"),pe.textContent=ut,ml=n(),u(ie.$$.fragment),ul=n(),oe=i("p"),oe.innerHTML=dt,dl=n(),re=i("p"),re.innerHTML=ht,hl=n(),u(fe.$$.fragment),Ml=n(),ce=i("p"),ce.innerHTML=Mt,bl=n(),u(me.$$.fragment),Tl=n(),X=i("blockquote"),X.innerHTML=bt,wl=n(),ue=i("p"),ue.innerHTML=Tt,yl=n(),u(de.$$.fragment),Jl=n(),he=i("p"),he.innerHTML=wt,jl=n(),u(Me.$$.fragment),Ul=n(),be=i("p"),be.innerHTML=yt,$l=n(),u(Te.$$.fragment),Zl=n(),u(we.$$.fragment),Gl=n(),ye=i("p"),ye.innerHTML=Jt,gl=n(),Je=i("p"),Je.innerHTML=jt,_l=n(),R=i("blockquote"),R.innerHTML=Ut,Wl=n(),u(je.$$.fragment),xl=n(),Ue=i("p"),Ue.innerHTML=$t
,Bl=n(),$e=i("p"),$e.innerHTML=Zt,Cl=n(),u(Ze.$$.fragment),kl=n(),Ge=i("p"),Ge.innerHTML=Gt,vl=n(),ge=i("p"),ge.innerHTML=gt,Xl=n(),I=i("blockquote"),I.innerHTML=_t,Rl=n(),_e=i("p"),_e.textContent=Wt,Il=n(),u(We.$$.fragment),Vl=n(),xe=i("p"),xe.innerHTML=xt,Ll=n(),u(Be.$$.fragment),Hl=n(),u(Ce.$$.fragment),El=n(),ke=i("blockquote"),ke.innerHTML=Bt,Ql=n(),ve=i("p"),ve.textContent=Ct,Nl=n(),u(Xe.$$.fragment),Fl=n(),u(Re.$$.fragment),Yl=n(),Ie=i("ul"),Ie.innerHTML=kt,Sl=n(),u(Ve.$$.fragment),zl=n(),Le=i("p"),this.h()},l(e){const l=Ht("svelte-u9bgzb",document.head);c=o(l,"META",{name:!0,content:!0}),l.forEach(t),U=a(e),y=o(e,"P",{}),Xt(y).forEach(t),m=a(e),d(J.$$.fragment,e),r=a(e),d(p.$$.fragment,e),j=a(e),$=o(e,"P",{"data-svelte-h":!0}),f($)!=="svelte-194j5uc"&&($.innerHTML=x),G=a(e),w=o(e,"P",{"data-svelte-h":!0}),f(w)!=="svelte-k2hqss"&&(w.textContent=W),He=a(e),L=o(e,"P",{"data-svelte-h":!0}),f(L)!=="svelte-x0c89o"&&(L.textContent=Dl),Ee=a(e),d(H.$$.fragment,e),Qe=a(e),E=o(e,"P",{"data-svelte-h":!0}),f(E)!=="svelte-py0rfh"&&(E.textContent=Ol),Ne=a(e),d(B.$$.fragment,e),Fe=a(e),d(Q.$$.fragment,e),Ye=a(e),C=o(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),f(C)!=="svelte-8dllu1"&&(C.innerHTML=Kl),Se=a(e),N=o(e,"P",{"data-svelte-h":!0}),f(N)!=="svelte-3w1pt5"&&(N.innerHTML=et),ze=a(e),F=o(e,"P",{"data-svelte-h":!0}),f(F)!=="svelte-qbrif4"&&(F.innerHTML=lt),Ae=a(e),d(Y.$$.fragment,e),qe=a(e),d(S.$$.fragment,e),Pe=a(e),z=o(e,"P",{"data-svelte-h":!0}),f(z)!=="svelte-1pe18dl"&&(z.innerHTML=tt),De=a(e),A=o(e,"P",{"data-svelte-h":!0}),f(A)!=="svelte-g36iym"&&(A.innerHTML=st),Oe=a(e),d(q.$$.fragment,e),Ke=a(e),P=o(e,"P",{"data-svelte-h":!0}),f(P)!=="svelte-1wsgro0"&&(P.innerHTML=nt),el=a(e),k=o(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),f(k)!=="svelte-1fkr80r"&&(k.innerHTML=at),ll=a(e),D=o(e,"P",{"data-svelte-h":!0}),f(D)!=="svelte-9ijocs"&&(D.innerHTML=pt),tl=a(e),d(O.$$.fragment,e),sl=a(e),K=o(e,"P",{"data-svelte-h":!0}),f(K)!=="svelte-15it70t"&&(K.textContent=it),nl=a(e)
,d(ee.$$.fragment,e),al=a(e),v=o(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),f(v)!=="svelte-16e1eoe"&&(v.innerHTML=ot),pl=a(e),le=o(e,"P",{"data-svelte-h":!0}),f(le)!=="svelte-1ahxi03"&&(le.innerHTML=rt),il=a(e),te=o(e,"P",{"data-svelte-h":!0}),f(te)!=="svelte-169dym5"&&(te.innerHTML=ft),ol=a(e),d(se.$$.fragment,e),rl=a(e),ne=o(e,"P",{"data-svelte-h":!0}),f(ne)!=="svelte-jmoihs"&&(ne.innerHTML=ct),fl=a(e),ae=o(e,"P",{"data-svelte-h":!0}),f(ae)!=="svelte-w2qqjo"&&(ae.innerHTML=mt),cl=a(e),pe=o(e,"P",{"data-svelte-h":!0}),f(pe)!=="svelte-dsv3x4"&&(pe.textContent=ut),ml=a(e),d(ie.$$.fragment,e),ul=a(e),oe=o(e,"P",{"data-svelte-h":!0}),f(oe)!=="svelte-1nyxskn"&&(oe.innerHTML=dt),dl=a(e),re=o(e,"P",{"data-svelte-h":!0}),f(re)!=="svelte-1qqd039"&&(re.innerHTML=ht),hl=a(e),d(fe.$$.fragment,e),Ml=a(e),ce=o(e,"P",{"data-svelte-h":!0}),f(ce)!=="svelte-wmi0r6"&&(ce.innerHTML=Mt),bl=a(e),d(me.$$.fragment,e),Tl=a(e),X=o(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),f(X)!=="svelte-1tna39"&&(X.innerHTML=bt),wl=a(e),ue=o(e,"P",{"data-svelte-h":!0}),f(ue)!=="svelte-6v6dgf"&&(ue.innerHTML=Tt),yl=a(e),d(de.$$.fragment,e),Jl=a(e),he=o(e,"P",{"data-svelte-h":!0}),f(he)!=="svelte-zlqqst"&&(he.innerHTML=wt),jl=a(e),d(Me.$$.fragment,e),Ul=a(e),be=o(e,"P",{"data-svelte-h":!0}),f(be)!=="svelte-qa92su"&&(be.innerHTML=yt),$l=a(e),d(Te.$$.fragment,e),Zl=a(e),d(we.$$.fragment,e),Gl=a(e),ye=o(e,"P",{"data-svelte-h":!0}),f(ye)!=="svelte-jzcid"&&(ye.innerHTML=Jt),gl=a(e),Je=o(e,"P",{"data-svelte-h":!0}),f(Je)!=="svelte-11fadgl"&&(Je.innerHTML=jt),_l=a(e),R=o(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),f(R)!=="svelte-yipkr8"&&(R.innerHTML=Ut),Wl=a(e),d(je.$$.fragment,e),xl=a(e),Ue=o(e,"P",{"data-svelte-h":!0}),f(Ue)!=="svelte-h31nps"&&(Ue.innerHTML=$t),Bl=a(e),$e=o(e,"P",{"data-svelte-h":!0}),f($e)!=="svelte-oh3qub"&&($e.innerHTML=Zt),Cl=a(e),d(Ze.$$.fragment,e),kl=a(e),Ge=o(e,"P",{"data-svelte-h":!0}),f(Ge)!=="svelte-1esyvy8"&&(Ge.innerHTML=Gt),vl=a(e),ge=o(e,"P",{"data-svelte-h":!0}),f(ge)!=="
svelte-10dayln"&&(ge.innerHTML=gt),Xl=a(e),I=o(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),f(I)!=="svelte-1xrzhvm"&&(I.innerHTML=_t),Rl=a(e),_e=o(e,"P",{"data-svelte-h":!0}),f(_e)!=="svelte-19bhy1d"&&(_e.textContent=Wt),Il=a(e),d(We.$$.fragment,e),Vl=a(e),xe=o(e,"P",{"data-svelte-h":!0}),f(xe)!=="svelte-17j89ie"&&(xe.innerHTML=xt),Ll=a(e),d(Be.$$.fragment,e),Hl=a(e),d(Ce.$$.fragment,e),El=a(e),ke=o(e,"BLOCKQUOTE",{"data-svelte-h":!0}),f(ke)!=="svelte-tty6r3"&&(ke.innerHTML=Bt),Ql=a(e),ve=o(e,"P",{"data-svelte-h":!0}),f(ve)!=="svelte-qv5ldk"&&(ve.textContent=Ct),Nl=a(e),d(Xe.$$.fragment,e),Fl=a(e),d(Re.$$.fragment,e),Yl=a(e),Ie=o(e,"UL",{"data-svelte-h":!0}),f(Ie)!=="svelte-58n17n"&&(Ie.innerHTML=kt),Sl=a(e),d(Ve.$$.fragment,e),zl=a(e),Le=o(e,"P",{}),Xt(Le).forEach(t),this.h()},h(){V(c,"name","hf:doc:metadata"),V(c,"content",Pt),V(C,"class","tip"),V(k,"class","tip"),V(v,"class","tip"),V(X,"class","tip"),V(R,"class","tip"),V(I,"class","tip")},m(e,l){Et(document.head,c),s(e,U,l),s(e,y,l),s(e,m,l),h(J,e,l),s(e,r,l),h(p,e,l),s(e,j,l),s(e,$,l),s(e,G,l),s(e,w,l),s(e,He,l),s(e,L,l),s(e,Ee,l),h(H,e,l),s(e,Qe,l),s(e,E,l),s(e,Ne,l),h(B,e,l),s(e,Fe,l),h(Q,e,l),s(e,Ye,l),s(e,C,l),s(e,Se,l),s(e,N,l),s(e,ze,l),s(e,F,l),s(e,Ae,l),h(Y,e,l),s(e,qe,l),h(S,e,l),s(e,Pe,l),s(e,z,l),s(e,De,l),s(e,A,l),s(e,Oe,l),h(q,e,l),s(e,Ke,l),s(e,P,l),s(e,el,l),s(e,k,l),s(e,ll,l),s(e,D,l),s(e,tl,l),h(O,e,l),s(e,sl,l),s(e,K,l),s(e,nl,l),h(ee,e,l),s(e,al,l),s(e,v,l),s(e,pl,l),s(e,le,l),s(e,il,l),s(e,te,l),s(e,ol,l),h(se,e,l),s(e,rl,l),s(e,ne,l),s(e,fl,l),s(e,ae,l),s(e,cl,l),s(e,pe,l),s(e,ml,l),h(ie,e,l),s(e,ul,l),s(e,oe,l),s(e,dl,l),s(e,re,l),s(e,hl,l),h(fe,e,l),s(e,Ml,l),s(e,ce,l),s(e,bl,l),h(me,e,l),s(e,Tl,l),s(e,X,l),s(e,wl,l),s(e,ue,l),s(e,yl,l),h(de,e,l),s(e,Jl,l),s(e,he,l),s(e,jl,l),h(Me,e,l),s(e,Ul,l),s(e,be,l),s(e,$l,l),h(Te,e,l),s(e,Zl,l),h(we,e,l),s(e,Gl,l),s(e,ye,l),s(e,gl,l),s(e,Je,l),s(e,_l,l),s(e,R,l),s(e,Wl,l),h(je,e,l),s(e,xl,l),s(e,Ue,l),s(e,Bl,l),s(e,$e,l),s(e,Cl,l),h(Ze,e,l),s(e,kl,
l),s(e,Ge,l),s(e,vl,l),s(e,ge,l),s(e,Xl,l),s(e,I,l),s(e,Rl,l),s(e,_e,l),s(e,Il,l),h(We,e,l),s(e,Vl,l),s(e,xe,l),s(e,Ll,l),h(Be,e,l),s(e,Hl,l),h(Ce,e,l),s(e,El,l),s(e,ke,l),s(e,Ql,l),s(e,ve,l),s(e,Nl,l),h(Xe,e,l),s(e,Fl,l),h(Re,e,l),s(e,Yl,l),s(e,Ie,l),s(e,Sl,l),h(Ve,e,l),s(e,zl,l),s(e,Le,l),Al=!0},p(e,[l]){const vt={};l&2&&(vt.$$scope={dirty:l,ctx:e}),B.$set(vt)},i(e){Al||(M(J.$$.fragment,e),M(p.$$.fragment,e),M(H.$$.fragment,e),M(B.$$.fragment,e),M(Q.$$.fragment,e),M(Y.$$.fragment,e),M(S.$$.fragment,e),M(q.$$.fragment,e),M(O.$$.fragment,e),M(ee.$$.fragment,e),M(se.$$.fragment,e),M(ie.$$.fragment,e),M(fe.$$.fragment,e),M(me.$$.fragment,e),M(de.$$.fragment,e),M(Me.$$.fragment,e),M(Te.$$.fragment,e),M(we.$$.fragment,e),M(je.$$.fragment,e),M(Ze.$$.fragment,e),M(We.$$.fragment,e),M(Be.$$.fragment,e),M(Ce.$$.fragment,e),M(Xe.$$.fragment,e),M(Re.$$.fragment,e),M(Ve.$$.fragment,e),Al=!0)},o(e){b(J.$$.fragment,e),b(p.$$.fragment,e),b(H.$$.fragment,e),b(B.$$.fragment,e),b(Q.$$.fragment,e),b(Y.$$.fragment,e),b(S.$$.fragment,e),b(q.$$.fragment,e),b(O.$$.fragment,e),b(ee.$$.fragment,e),b(se.$$.fragment,e),b(ie.$$.fragment,e),b(fe.$$.fragment,e),b(me.$$.fragment,e),b(de.$$.fragment,e),b(Me.$$.fragment,e),b(Te.$$.fragment,e),b(we.$$.fragment,e),b(je.$$.fragment,e),b(Ze.$$.fragment,e),b(We.$$.fragment,e),b(Be.$$.fragment,e),b(Ce.$$.fragment,e),b(Xe.$$.fragment,e),b(Re.$$.fragment,e),b(Ve.$$.fragment,e),Al=!1},d(e){e&&(t(U),t(y),t(m),t(r),t(j),t($),t(G),t(w),t(He),t(L),t(Ee),t(Qe),t(E),t(Ne),t(Fe),t(Ye),t(C),t(Se),t(N),t(ze),t(F),t(Ae),t(qe),t(Pe),t(z),t(De),t(A),t(Oe),t(Ke),t(P),t(el),t(k),t(ll),t(D),t(tl),t(sl),t(K),t(nl),t(al),t(v),t(pl),t(le),t(il),t(te),t(ol),t(rl),t(ne),t(fl),t(ae),t(cl),t(pe),t(ml),t(ul),t(oe),t(dl),t(re),t(hl),t(Ml),t(ce),t(bl),t(Tl),t(X),t(wl),t(ue),t(yl),t(Jl),t(he),t(jl),t(Ul),t(be),t($l),t(Zl),t(Gl),t(ye),t(gl),t(Je),t(_l),t(R),t(Wl),t(xl),t(Ue),t(Bl),t($e),t(Cl),t(kl),t(Ge),t(vl),t(ge),t(Xl),t(I),t(Rl),t(_e),t(Il),t(Vl),t(xe),t(Ll),t(Hl),t(El),t(ke),t(
Ql),t(ve),t(Nl),t(Fl),t(Yl),t(Ie),t(Sl),t(zl),t(Le)),t(c),T(J,e),T(p,e),T(H,e),T(B,e),T(Q,e),T(Y,e),T(S,e),T(q,e),T(O,e),T(ee,e),T(se,e),T(ie,e),T(fe,e),T(me,e),T(de,e),T(Me,e),T(Te,e),T(we,e),T(je,e),T(Ze,e),T(We,e),T(Be,e),T(Ce,e),T(Xe,e),T(Re,e),T(Ve,e)}}}
/**
 * Pt: JSON-encoded string describing this page's heading outline.
 * Each entry carries `title`, `local`, a nested `sections` array and a `depth`.
 * The fragment's h() method assigns it as the `content` attribute of the
 * element whose `name` attribute is "hf:doc:metadata".
 * NOTE(review): `local` looks like the heading's anchor id — confirm against the doc builder.
 */
const Pt='{"title":"加速推理","local":"加速推理","sections":[{"title":"模型数据类型","local":"模型数据类型","sections":[],"depth":2},{"title":"缩放点积注意力","local":"缩放点积注意力","sections":[],"depth":2},{"title":"torch.compile","local":"torchcompile","sections":[{"title":"动态形状编译","local":"动态形状编译","sections":[],"depth":3},{"title":"区域编译","local":"区域编译","sections":[],"depth":3},{"title":"图中断","local":"图中断","sections":[],"depth":3},{"title":"GPU同步","local":"gpu同步","sections":[],"depth":3},{"title":"基准测试","local":"基准测试","sections":[],"depth":3}],"depth":2},{"title":"动态量化","local":"动态量化","sections":[],"depth":2},{"title":"融合投影矩阵","local":"融合投影矩阵","sections":[],"depth":2},{"title":"资源","local":"资源","sections":[],"depth":2}],"depth":1}';
/**
 * Dt: instance function handed to Lt() by the component class below.
 * Registers a callback through It (imported from the scheduler chunk;
 * presumably Svelte's onMount — confirm) and returns an empty array.
 * The callback reads the "fw" query parameter from window.location.search,
 * but the value is discarded, so it has no visible effect here.
 * The parameter `_` is never used.
 */
function Dt(_){return It(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}
/**
 * ss: the page component class, exported from this module as `component`.
 * Extends Vt (imported from the index chunk). The constructor calls super()
 * and then Lt(this, c, Dt, qt, Rt, {}), passing the constructor argument `c`,
 * the instance function Dt, the fragment factory qt, Rt from the scheduler
 * chunk, and an empty object.
 * NOTE(review): Vt/Lt look like Svelte's SvelteComponent/init, with `c` being
 * the component options — confirm against the imported chunks.
 */
class ss extends Vt{constructor(c){super(),Lt(this,c,Dt,qt,Rt,{})}}
export{ss as component};

Xet Storage Details

Size:
43 kB
·
Xet hash:
5010122c83f3e2bb4b56a41f5c412fc8e5f179b48a23f28fe9de877ac3bb951d

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.