// Page chrome captured with this file (not part of the module): "Buckets:" / "rtrm's picture" / "download" / "raw" / "59.9 kB"
import{s as rs,o as ms,n as $}from"../chunks/scheduler.5c93273d.js";import{S as Js,i as us,g as u,s as r,r as Z,A as ys,h as y,f as e,c as m,j as cs,u as B,x as f,k as Rl,y as ds,a as p,v as _,d as I,t as C,w as g}from"../chunks/index.e43dd92b.js";import{T as Sl}from"../chunks/Tip.1cbfe904.js";import{C as Q}from"../chunks/CodeBlock.6896320e.js";import{H as Jl,E as Us}from"../chunks/getInferenceSnippets.7d64e4c6.js";import{H as Hl,a as x}from"../chunks/HfOption.d50154c3.js";function fs(w){let s,c="对于更快的上下文并行推理,请尝试使用支持 NVLink 的 NVIDIA A100 或 H100 GPU(如果可用),尤其是在 GPU 数量较多时。";return{c(){s=u("p"),s.textContent=c},l(a){s=y(a,"P",{"data-svelte-h":!0}),f(s)!=="svelte-sp9ouz"&&(s.textContent=c)},m(a,o){p(a,s,o)},p:$,d(a){a&&e(s)}}}function js(w){let s,c="要在 FLUX.1-dev 上应用第一块缓存,请调用 <code>apply_cache_on_pipe</code>,如下所示。0.08 是 FLUX 模型的默认残差差异值。",a,o,t,n,d='<thead><tr><th>优化</th> <th>原始</th> <th>FBCache rdt=0.06</th> <th>FBCache rdt=0.08</th> <th>FBCache rdt=0.10</th> <th>FBCache rdt=0.12</th></tr></thead> <tbody><tr><td>预览</td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-original.png" alt="Original"/></td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.06.png" alt="FBCache rdt=0.06"/></td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.08.png" alt="FBCache rdt=0.08"/></td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.10.png" alt="FBCache rdt=0.10"/></td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.12.png" alt="FBCache rdt=0.12"/></td></tr> <tr><td>墙时间 (s)</td> <td>26.36</td> <td>21.83</td> <td>17.01</td> <td>16.00</td> <td>13.78</td></tr></tbody>',j,J,b="First Block Cache 将推理速度降低到 17.01 秒,与基线相比,或快 1.55 
倍,同时保持几乎零质量损失。",G;return o=new Q({props:{code:"aW1wb3J0JTIwdGltZSUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMEZsdXhQaXBlbGluZSUwQSUwQXBpcGUlMjAlM0QlMjBGbHV4UGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMmJsYWNrLWZvcmVzdC1sYWJzJTJGRkxVWC4xLWRldiUyMiUyQyUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guYmZsb2F0MTYlMkMlMEEpLnRvKCUyMmN1ZGElMjIpJTBBJTBBZnJvbSUyMHBhcmFfYXR0bi5maXJzdF9ibG9ja19jYWNoZS5kaWZmdXNlcnNfYWRhcHRlcnMlMjBpbXBvcnQlMjBhcHBseV9jYWNoZV9vbl9waXBlJTBBJTBBYXBwbHlfY2FjaGVfb25fcGlwZShwaXBlJTJDJTIwcmVzaWR1YWxfZGlmZl90aHJlJTBBc2hvbGQlM0QwLjA4KSUwQSUwQSUyMyUyMCVFNSU5MCVBRiVFNyU5NCVBOCVFNSU4NiU4NSVFNSVBRCU5OCVFOCU4QSU4MiVFNyU5QyU4MSUwQSUyMyUyMHBpcGUuZW5hYmxlX21vZGVsX2NwdV9vZmZsb2FkKCklMEElMjMlMjBwaXBlLmVuYWJsZV9zZXF1ZW50aWFsX2NwdV9vZmZsb2FkKCklMEElMEFiZWdpbiUyMCUzRCUyMHRpbWUudGltZSgpJTBBaW1hZ2UlMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMCUyMkElMjBjYXQlMjBob2xkaW5nJTIwYSUyMHNpZ24lMjB0aGF0JTIwc2F5cyUyMGhlbGxvJTIwd29ybGQlMjIlMkMlMEElMjAlMjAlMjAlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNEMjglMkMlMEEpLmltYWdlcyU1QjAlNUQlMEFlbmQlMjAlM0QlMjB0aW1lLnRpbWUoKSUwQXByaW50KGYlMjJUaW1lJTNBJTIwJTdCZW5kJTIwLSUyMGJlZ2luJTNBLjJmJTdEcyUyMiklMEElMEFwcmludCglMjJTYXZpbmclMjBpbWFnZSUyMHRvJTIwZmx1eC5wbmclMjIpJTBBaW1hZ2Uuc2F2ZSglMjJmbHV4LnBuZyUyMik=",highlighted:`<span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> FluxPipeline
pipe = FluxPipeline.from_pretrained(
<span class="hljs-string">&quot;black-forest-labs/FLUX.1-dev&quot;</span>,
torch_dtype=torch.bfloat16,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(pipe, residual_diff_threshold=<span class="hljs-number">0.08</span>)
<span class="hljs-comment"># 启用内存节省</span>
<span class="hljs-comment"># pipe.enable_model_cpu_offload()</span>
<span class="hljs-comment"># pipe.enable_sequential_cpu_offload()</span>
begin = time.time()
image = pipe(
<span class="hljs-string">&quot;A cat holding a sign that says hello world&quot;</span>,
num_inference_steps=<span class="hljs-number">28</span>,
).images[<span class="hljs-number">0</span>]
end = time.time()
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Time: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Saving image to flux.png&quot;</span>)
image.save(<span class="hljs-string">&quot;flux.png&quot;</span>)`,wrap:!1}}),{c(){s=u("p"),s.innerHTML=c,a=r(),Z(o.$$.fragment),t=r(),n=u("table"),n.innerHTML=d,j=r(),J=u("p"),J.textContent=b},l(U){s=y(U,"P",{"data-svelte-h":!0}),f(s)!=="svelte-13o107o"&&(s.innerHTML=c),a=m(U),B(o.$$.fragment,U),t=m(U),n=y(U,"TABLE",{"data-svelte-h":!0}),f(n)!=="svelte-9gupjq"&&(n.innerHTML=d),j=m(U),J=y(U,"P",{"data-svelte-h":!0}),f(J)!=="svelte-59xgrt"&&(J.textContent=b)},m(U,W){p(U,s,W),p(U,a,W),_(o,U,W),p(U,t,W),p(U,n,W),p(U,j,W),p(U,J,W),G=!0},p:$,i(U){G||(I(o.$$.fragment,U),G=!0)},o(U){C(o.$$.fragment,U),G=!1},d(U){U&&(e(s),e(a),e(t),e(n),e(j),e(J)),g(o,U)}}}function hs(w){let s,c="要在 HunyuanVideo 上应用 First Block Cache,请使用 <code>apply_cache_on_pipe</code>,如下所示。0.06 是 HunyuanVideo 模型的默认残差差值。",a,o,t,n,d=`<source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/hunyuan-video-original.mp4" type="video/mp4"/>
您的浏览器不支持视频标签。`,j,J,b="HunyuanVideo 无 FBCache",G,U,W=`<source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/hunyuan-video-fbc.mp4" type="video/mp4"/>
Your browser does not support the video tag.`,M,T,k="HunyuanVideo 与 FBCache",Y,X,v="First Block Cache 将推理速度降低至 2271.06 秒,相比基线快了 1.62 倍,同时保持了几乎为零的质量损失。",z;return o=new Q({props:{code:"aW1wb3J0JTIwdGltZSUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMEh1bnl1YW5WaWRlb1BpcGVsaW5lJTJDJTIwSHVueXVhblZpZGVvVHJhbnNmb3JtZXIzRE1vZGVsJTBBZnJvbSUyMGRpZmZ1c2Vycy51dGlscyUyMGltcG9ydCUyMGV4cG9ydF90b192aWRlbyUwQSUwQW1vZGVsX2lkJTIwJTNEJTIwJTIydGVuY2VudCUyRkh1bnl1YW5WaWRlbyUyMiUwQXRyYW5zZm9ybWVyJTIwJTNEJTIwSHVueXVhblZpZGVvVHJhbnNmb3JtZXIzRE1vZGVsLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjBtb2RlbF9pZCUyQyUwQSUyMCUyMCUyMCUyMHN1YmZvbGRlciUzRCUyMnRyYW5zZm9ybWVyJTIyJTJDJTBBJTIwJTIwJTIwJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5iZmxvYXQxNiUyQyUwQSUyMCUyMCUyMCUyMHJldmlzaW9uJTNEJTIycmVmcyUyRnByJTJGMTglMjIlMkMlMEEpJTBBcGlwZSUyMCUzRCUyMEh1bnl1YW5WaWRlb1BpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjBtb2RlbF9pZCUyQyUwQSUyMCUyMCUyMCUyMHRyYW5zZm9ybWVyJTNEdHJhbnNmb3JtZXIlMkMlMEElMjAlMjAlMjAlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmZsb2F0MTYlMkMlMEElMjAlMjAlMjAlMjByZXZpc2lvbiUzRCUyMnJlZnMlMkZwciUyRjE4JTIyJTJDJTBBKS50byglMjJjdWRhJTIyKSUwQSUwQWZyb20lMjBwYXJhX2F0dG4uZmlyc3RfYmxvY2tfY2FjaGUuZGlmZnVzZXJzX2FkYXB0ZXJzJTIwaW1wb3J0JTIwYXBwbHlfY2FjaGVfb25fcGlwZSUwQSUwQWFwcGx5X2NhY2hlX29uX3BpcGUocGlwZSUyQyUyMHJlc2lkdWFsX2RpZmZfdGhyZXNob2xkJTNEMC42KSUwQSUwQXBpcGUudmFlLmVuYWJsZV90aWxpbmcoKSUwQSUwQWJlZ2luJTIwJTNEJTIwdGltZS50aW1lKCklMEFvdXRwdXQlMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMHByb21wdCUzRCUyMkElMjBjYXQlMjB3YWxrcyUyMG9uJTIwdGhlJTIwZ3Jhc3MlMkMlMjByZWFsaXN0aWMlMjIlMkMlMEElMjAlMjAlMjAlMjBoZWlnaHQlM0Q3MjAlMkMlMEElMjAlMjAlMjAlMjB3aWR0aCUzRDEyODAlMkMlMEElMjAlMjAlMjAlMjBudW1fZnJhbWVzJTNEMTI5JTJDJTBBJTIwJTIwJTIwJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDMwJTJDJTBBKS5mcmFtZXMlNUIwJTVEJTBBZW5kJTIwJTNEJTIwdGltZS50aW1lKCklMEFwcmludChmJTIyVGltZSUzQSUyMCU3QmVuZCUyMC0lMjBiZWdpbiUzQS4yZiU3RHMlMjIpJTBBJTBBcHJpbnQoJTIyU2F2aW5nJTIwdmlkZW8lMjB0byUyMGh1bnl1YW5fdmlkZW8ubXA0JTIyKSUwQWV4cG9ydF90b192aWRlbyhvdXRwdXQlMkMlMjAlMjJodW55dWFuX3ZpZGVvLm1wN
CUyMiUyQyUyMGZwcyUzRDE1KQ==",highlighted:`<span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_video
model_id = <span class="hljs-string">&quot;tencent/HunyuanVideo&quot;</span>
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
model_id,
subfolder=<span class="hljs-string">&quot;transformer&quot;</span>,
torch_dtype=torch.bfloat16,
revision=<span class="hljs-string">&quot;refs/pr/18&quot;</span>,
)
pipe = HunyuanVideoPipeline.from_pretrained(
model_id,
transformer=transformer,
torch_dtype=torch.float16,
revision=<span class="hljs-string">&quot;refs/pr/18&quot;</span>,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(pipe, residual_diff_threshold=<span class="hljs-number">0.6</span>)
pipe.vae.enable_tiling()
begin = time.time()
output = pipe(
prompt=<span class="hljs-string">&quot;A cat walks on the grass, realistic&quot;</span>,
height=<span class="hljs-number">720</span>,
width=<span class="hljs-number">1280</span>,
num_frames=<span class="hljs-number">129</span>,
num_inference_steps=<span class="hljs-number">30</span>,
).frames[<span class="hljs-number">0</span>]
end = time.time()
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Time: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Saving video to hunyuan_video.mp4&quot;</span>)
export_to_video(output, <span class="hljs-string">&quot;hunyuan_video.mp4&quot;</span>, fps=<span class="hljs-number">15</span>)`,wrap:!1}}),{c(){s=u("p"),s.innerHTML=c,a=r(),Z(o.$$.fragment),t=r(),n=u("video"),n.innerHTML=d,j=r(),J=u("small"),J.textContent=b,G=r(),U=u("video"),U.innerHTML=W,M=r(),T=u("small"),T.textContent=k,Y=r(),X=u("p"),X.textContent=v,this.h()},l(h){s=y(h,"P",{"data-svelte-h":!0}),f(s)!=="svelte-1enthpa"&&(s.innerHTML=c),a=m(h),B(o.$$.fragment,h),t=m(h),n=y(h,"VIDEO",{"data-svelte-h":!0}),f(n)!=="svelte-7ylzx3"&&(n.innerHTML=d),j=m(h),J=y(h,"SMALL",{"data-svelte-h":!0}),f(J)!=="svelte-1xtgkhf"&&(J.textContent=b),G=m(h),U=y(h,"VIDEO",{"data-svelte-h":!0}),f(U)!=="svelte-1lwel73"&&(U.innerHTML=W),M=m(h),T=y(h,"SMALL",{"data-svelte-h":!0}),f(T)!=="svelte-1omedp"&&(T.textContent=k),Y=m(h),X=y(h,"P",{"data-svelte-h":!0}),f(X)!=="svelte-b3qx2r"&&(X.textContent=v),this.h()},h(){n.controls="",U.controls=""},m(h,V){p(h,s,V),p(h,a,V),_(o,h,V),p(h,t,V),p(h,n,V),p(h,j,V),p(h,J,V),p(h,G,V),p(h,U,V),p(h,M,V),p(h,T,V),p(h,Y,V),p(h,X,V),z=!0},p:$,i(h){z||(I(o.$$.fragment,h),z=!0)},o(h){C(o.$$.fragment,h),z=!1},d(h){h&&(e(s),e(a),e(t),e(n),e(j),e(J),e(G),e(U),e(M),e(T),e(Y),e(X)),g(o,h)}}}function ws(w){let s,c,a,o;return s=new x({props:{id:"first-block-cache",option:"FLUX-1.dev",$$slots:{default:[js]},$$scope:{ctx:w}}}),a=new x({props:{id:"first-block-cache",option:"HunyuanVideo",$$slots:{default:[hs]},$$scope:{ctx:w}}}),{c(){Z(s.$$.fragment),c=r(),Z(a.$$.fragment)},l(t){B(s.$$.fragment,t),c=m(t),B(a.$$.fragment,t)},m(t,n){_(s,t,n),p(t,c,n),_(a,t,n),o=!0},p(t,n){const d={};n&2&&(d.$$scope={dirty:n,ctx:t}),s.$set(d);const j={};n&2&&(j.$$scope={dirty:n,ctx:t}),a.$set(j)},i(t){o||(I(s.$$.fragment,t),I(a.$$.fragment,t),o=!0)},o(t){C(s.$$.fragment,t),C(a.$$.fragment,t),o=!1},d(t){t&&e(c),g(s,t),g(a,t)}}}function Ts(w){let s,c="动态量化可能会显著改变模型输出的分布,因此您需要将 <code>residual_diff_threshold</code> 
设置为更大的值以使其生效。";return{c(){s=u("p"),s.innerHTML=c},l(a){s=y(a,"P",{"data-svelte-h":!0}),f(s)!=="svelte-t23jb1"&&(s.innerHTML=c)},m(a,o){p(a,s,o)},p:$,d(a){a&&e(s)}}}function bs(w){let s,c,a,o="fp8 动态量化和 torch.compile 将推理速度降低至 7.56 秒,相比基线快了 3.48 倍。",t;return s=new Q({props:{code:"aW1wb3J0JTIwdGltZSUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMEZsdXhQaXBlbGluZSUwQSUwQXBpcGUlMjAlM0QlMjBGbHV4UGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMmJsYWNrLWZvcmVzdC1sYWJzJTJGRkxVWC4xLWRldiUyMiUyQyUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guYmZsb2F0MTYlMkMlMEEpLnRvKCUyMmN1ZGElMjIpJTBBJTBBZnJvbSUyMHBhcmFfYXR0bi5maXJzdF9ibG9ja19jYWNoZS5kaWZmdXNlcnNfYWRhcHRlcnMlMjBpbXBvcnQlMjBhcHBseV9jYWNoZV9vbl9waXBlJTBBJTBBYXBwbHlfY2FjaGVfb25fcGlwZSglMEElMjAlMjAlMjAlMjBwaXBlJTJDJTBBJTIwJTIwJTIwJTIwcmVzaWR1YWxfZGlmZl90aHJlc2hvbGQlM0QwLjEyJTJDJTIwJTIwJTIzJTIwJUU0JUJEJUJGJUU3JTk0JUE4JUU2JTlCJUI0JUU1JUE0JUE3JUU3JTlBJTg0JUU1JTgwJUJDJUU0JUJCJUE1JUU0JUJEJUJGJUU3JUJDJTkzJUU1JUFEJTk4JUU3JTk0JTlGJUU2JTk1JTg4JTBBKSUwQSUwQWZyb20lMjB0b3JjaGFvLnF1YW50aXphdGlvbiUyMGltcG9ydCUyMHF1YW50aXplXyUyQyUyMGZsb2F0OF9keW5hbWljX2FjdGl2YXRpb25fZmxvYXQ4X3dlaWdodCUyQyUyMGZsb2F0OF93ZWlnaHRfb25seSUwQSUwQXF1YW50aXplXyhwaXBlLnRleHRfZW5jb2RlciUyQyUyMGZsb2F0OF93ZWlnaHRfb25seSgpKSUwQXF1YW50aXplXyhwaXBlLnRyYW5zZm9ybWVyJTJDJTIwZmxvYXQ4X2R5bmFtaWNfYWN0aXZhdGlvbl9mbG9hdDhfd2VpZ2h0KCkpJTBBcGlwZS50cmFuc2Zvcm1lciUyMCUzRCUyMHRvcmNoLmNvbXBpbGUoJTBBJTIwJTIwJTIwcGlwZS50cmFuc2Zvcm1lciUyQyUyMG1vZGUlM0QlMjJtYXgtYXV0b3R1bmUtbm8tY3VkYWdyYXBocyUyMiUyQyUwQSklMEElMEElMjMlMjAlRTUlOTAlQUYlRTclOTQlQTglRTUlODYlODUlRTUlQUQlOTglRTglOEElODIlRTclOUMlODElMEElMjMlMjBwaXBlLmVuYWJsZV9tb2RlbF9jcHVfb2ZmbG9hZCgpJTBBJTIzJTIwcGlwZS5lbmFibGVfc2VxdWVudGlhbF9jcHVfb2ZmbG9hZCgpJTBBJTBBZm9yJTIwaSUyMGluJTIwcmFuZ2UoMiklM0ElMEElMjAlMjAlMjAlMjBiZWdpbiUyMCUzRCUyMHRpbWUudGltZSgpJTBBJTIwJTIwJTIwJTIwaW1hZ2UlMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMkElMjBjYXQlMjBob2xkaW5nJTIwYSUyMHNpZ24lMjB0aGF0JTIwc2F5cyUyMGhlbGxvJTIwd29ybGQlMjIlMkMlMEElMjAlM
jAlMjAlMjAlMjAlMjAlMjAlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNEMjglMkMlMEElMjAlMjAlMjAlMjApLmltYWdlcyU1QjAlNUQlMEElMjAlMjAlMjAlMjBlbmQlMjAlM0QlMjB0aW1lLnRpbWUoKSUwQSUyMCUyMCUyMCUyMGlmJTIwaSUyMCUzRCUzRCUyMDAlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBwcmludChmJTIyJUU5JUEyJTg0JUU3JTgzJUFEJUU2JTk3JUI2JUU5JTk3JUI0JTNBJTIwJTdCZW5kJTIwLSUyMGJlZ2luJTNBLjJmJTdEcyUyMiklMEElMjAlMjAlMjAlMjBlbHNlJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcHJpbnQoZiUyMiVFNiU5NyVCNiVFOSU5NyVCNCUzQSUyMCU3QmVuZCUyMC0lMjBiZWdpbiUzQS4yZiU3RHMlMjIpJTBBJTBBcHJpbnQoJTIyJUU0JUJGJTlEJUU1JUFEJTk4JUU1JTlCJUJFJUU1JTgzJThGJUU1JTg4JUIwJTIwZmx1eC5wbmclMjIpJTBBaW1hZ2Uuc2F2ZSglMjJmbHV4LnBuZyUyMik=",highlighted:`<span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> FluxPipeline
pipe = FluxPipeline.from_pretrained(
<span class="hljs-string">&quot;black-forest-labs/FLUX.1-dev&quot;</span>,
torch_dtype=torch.bfloat16,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(
pipe,
residual_diff_threshold=<span class="hljs-number">0.12</span>, <span class="hljs-comment"># 使用更大的值以使缓存生效</span>
)
<span class="hljs-keyword">from</span> torchao.quantization <span class="hljs-keyword">import</span> quantize_, float8_dynamic_activation_float8_weight, float8_weight_only
quantize_(pipe.text_encoder, float8_weight_only())
quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())
pipe.transformer = torch.<span class="hljs-built_in">compile</span>(
pipe.transformer, mode=<span class="hljs-string">&quot;max-autotune-no-cudagraphs&quot;</span>,
)
<span class="hljs-comment"># 启用内存节省</span>
<span class="hljs-comment"># pipe.enable_model_cpu_offload()</span>
<span class="hljs-comment"># pipe.enable_sequential_cpu_offload()</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">2</span>):
begin = time.time()
image = pipe(
<span class="hljs-string">&quot;A cat holding a sign that says hello world&quot;</span>,
num_inference_steps=<span class="hljs-number">28</span>,
).images[<span class="hljs-number">0</span>]
end = time.time()
<span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;预热时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-keyword">else</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;保存图像到 flux.png&quot;</span>)
image.save(<span class="hljs-string">&quot;flux.png&quot;</span>)`,wrap:!1}}),{c(){Z(s.$$.fragment),c=r(),a=u("p"),a.textContent=o},l(n){B(s.$$.fragment,n),c=m(n),a=y(n,"P",{"data-svelte-h":!0}),f(a)!=="svelte-vnhv5y"&&(a.textContent=o)},m(n,d){_(s,n,d),p(n,c,d),p(n,a,d),t=!0},p:$,i(n){t||(I(s.$$.fragment,n),t=!0)},o(n){C(s.$$.fragment,n),t=!1},d(n){n&&(e(c),e(a)),g(s,n)}}}function Zs(w){let s,c,a,o="NVIDIA L20 GPU 仅有 48GB 内存,在编译后且如果未调用 <code>enable_model_cpu_offload</code> 时,可能会遇到内存不足(OOM)错误,因为 HunyuanVideo 在高分辨率和大量帧数运行时具有非常大的激活张量。对于内存少于 80GB 的 GPU,可以尝试降低分辨率和帧数来避免 OOM 错误。",t,n,d="大型视频生成模型通常受注意力计算而非全连接层的瓶颈限制。这些模型不会从量化和 torch.compile 中显著受益。",j;return s=new Q({props:{code:"aW1wb3J0JTIwdGltZSUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMEh1bnl1YW5WaWRlb1BpcGVsaW5lJTJDJTIwSHVueXVhblZpZGVvVHJhbnNmb3JtZXIzRE1vZGVsJTBBZnJvbSUyMGRpZmZ1c2Vycy51dGlscyUyMGltcG9ydCUyMGV4cG9ydF90b192aWRlbyUwQSUwQW1vZGVsX2lkJTIwJTNEJTIwJTIydGVuY2VudCUyRkh1bnl1YW5WaWRlbyUyMiUwQXRyYW5zZm9ybWVyJTIwJTNEJTIwSHVueXVhblZpZGVvVHJhbnNmb3JtZXIzRE1vZGVsLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjBtb2RlbF9pZCUyQyUwQSUyMCUyMCUyMCUyMHN1YmZvbGRlciUzRCUyMnRyYW5zZm9ybWVyJTIyJTJDJTBBJTIwJTIwJTIwJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5iZmxvYXQxNiUyQyUwQSUyMCUyMCUyMCUyMHJldmlzaW9uJTNEJTIycmVmcyUyRnByJTJGMTglMjIlMkMlMEEpJTBBcGlwZSUyMCUzRCUyMEh1bnl1YW5WaWRlb1BpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjBtb2RlbF9pZCUyQyUwQSUyMCUyMCUyMCUyMHRyYW5zZm9ybWVyJTNEdHJhbnNmb3JtZXIlMkMlMEElMjAlMjAlMjAlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmZsb2F0MTYlMkMlMEElMjAlMjAlMjAlMjByZXZpc2lvbiUzRCUyMnJlZnMlMkZwciUyRjE4JTIyJTJDJTBBKS50byglMjJjdWRhJTIyKSUwQSUwQWZyb20lMjBwYXJhX2F0dG4uZmlyc3RfYmxvY2tfY2FjaGUuZGlmZnVzZXJzX2FkYXB0ZXJzJTIwaW1wb3J0JTIwYXBwbHlfY2FjaGVfb25fcGlwZSUwQSUwQWFwcGx5X2NhY2hlX29uX3BpcGUocGlwZSklMEElMEFmcm9tJTIwdG9yY2hhby5xdWFudGl6YXRpb24lMjBpbXBvcnQlMjBxdWFudGl6ZV8lMkMlMjBmbG9hdDhfZHluYW1pY19hY3RpdmF0aW9uX2Zsb2F0OF93ZWlnaHQlMkMlMjBmbG9hdDhfd2VpZ2h0X29ubHklMEElMEFxdWFudGl6ZV8ocGlwZS50ZXh0X2VuY29kZXIlMkMlMjBmbG9hdDhfd2Vp
Z2h0X29ubHkoKSklMEFxdWFudGl6ZV8ocGlwZS50cmFuc2Zvcm1lciUyQyUyMGZsb2F0OF9keW5hbWljX2FjdGl2YXRpb25fZmxvYXQ4X3dlaWdodCgpKSUwQXBpcGUudHJhbnNmb3JtZXIlMjAlM0QlMjB0b3JjaC5jb21waWxlKCUwQSUyMCUyMCUyMHBpcGUudHJhbnNmb3JtZXIlMkMlMjBtb2RlJTNEJTIybWF4LWF1dG90dW5lLW5vLWN1ZGFncmFwaHMlMjIlMkMlMEEpJTBBJTBBJTIzJTIwRW5hYmxlJTIwbWVtb3J5JTIwc2F2aW5ncyUwQXBpcGUudmFlLmVuYWJsZV90aWxpbmcoKSUwQSUyMyUyMHBpcGUuZW5hYmxlX21vZGVsX2NwdV9vZmZsb2FkKCklMEElMjMlMjBwaXBlLmVuYWJsZV9zZXF1ZW50aWFsX2NwdV9vZmZsb2FkKCklMEElMEFmb3IlMjBpJTIwaW4lMjByYW5nZSgyKSUzQSUwQSUyMCUyMCUyMCUyMGJlZ2luJTIwJTNEJTIwdGltZS50aW1lKCklMEElMjAlMjAlMjAlMjBvdXRwdXQlMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHByb21wdCUzRCUyMkElMjBjYXQlMjB3YWxrcyUyMG9uJTIwdGhlJTIwZ3Jhc3MlMkMlMjByZWFsaXN0aWMlMjIlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBoZWlnaHQlM0Q3MjAlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB3aWR0aCUzRDEyODAlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBudW1fZnJhbWVzJTNEMTI5JTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDElMjBpZiUyMGklMjAlM0QlM0QlMjAwJTIwZWxzZSUyMDMwJTJDJTBBJTIwJTIwJTIwJTIwKS5mcmFtZXMlNUIwJTVEJTBBJTIwJTIwJTIwJTIwZW5kJTIwJTNEJTIwdGltZS50aW1lKCklMEElMjAlMjAlMjAlMjBpZiUyMGklMjAlM0QlM0QlMjAwJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcHJpbnQoZiUyMldhcm0lMjB1cCUyMHRpbWUlM0ElMjAlN0JlbmQlMjAtJTIwYmVnaW4lM0EuMmYlN0RzJTIyKSUwQSUyMCUyMCUyMCUyMGVsc2UlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBwcmludChmJTIyVGltZSUzQSUyMCU3QmVuZCUyMC0lMjBiZWdpbiUzQS4yZiU3RHMlMjIpJTBBJTBBcHJpbnQoJTIyU2F2aW5nJTIwdmlkZW8lMjB0byUyMGh1bnl1YW5fdmlkZW8ubXA0JTIyKSUwQWV4cG9ydF90b192aWRlbyhvdXRwdXQlMkMlMjAlMjJodW55dWFuX3ZpZGVvLm1wNCUyMiUyQyUyMGZwcyUzRDE1KQ==",highlighted:`<span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_video
model_id = <span class="hljs-string">&quot;tencent/HunyuanVideo&quot;</span>
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
model_id,
subfolder=<span class="hljs-string">&quot;transformer&quot;</span>,
torch_dtype=torch.bfloat16,
revision=<span class="hljs-string">&quot;refs/pr/18&quot;</span>,
)
pipe = HunyuanVideoPipeline.from_pretrained(
model_id,
transformer=transformer,
torch_dtype=torch.float16,
revision=<span class="hljs-string">&quot;refs/pr/18&quot;</span>,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(pipe)
<span class="hljs-keyword">from</span> torchao.quantization <span class="hljs-keyword">import</span> quantize_, float8_dynamic_activation_float8_weight, float8_weight_only
quantize_(pipe.text_encoder, float8_weight_only())
quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())
pipe.transformer = torch.<span class="hljs-built_in">compile</span>(
pipe.transformer, mode=<span class="hljs-string">&quot;max-autotune-no-cudagraphs&quot;</span>,
)
<span class="hljs-comment"># Enable memory savings</span>
pipe.vae.enable_tiling()
<span class="hljs-comment"># pipe.enable_model_cpu_offload()</span>
<span class="hljs-comment"># pipe.enable_sequential_cpu_offload()</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">2</span>):
begin = time.time()
output = pipe(
prompt=<span class="hljs-string">&quot;A cat walks on the grass, realistic&quot;</span>,
height=<span class="hljs-number">720</span>,
width=<span class="hljs-number">1280</span>,
num_frames=<span class="hljs-number">129</span>,
num_inference_steps=<span class="hljs-number">1</span> <span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> <span class="hljs-number">30</span>,
).frames[<span class="hljs-number">0</span>]
end = time.time()
<span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Warm up time: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-keyword">else</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Time: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Saving video to hunyuan_video.mp4&quot;</span>)
export_to_video(output, <span class="hljs-string">&quot;hunyuan_video.mp4&quot;</span>, fps=<span class="hljs-number">15</span>)`,wrap:!1}}),{c(){Z(s.$$.fragment),c=r(),a=u("p"),a.innerHTML=o,t=r(),n=u("p"),n.textContent=d},l(J){B(s.$$.fragment,J),c=m(J),a=y(J,"P",{"data-svelte-h":!0}),f(a)!=="svelte-7u4mbb"&&(a.innerHTML=o),t=m(J),n=y(J,"P",{"data-svelte-h":!0}),f(n)!=="svelte-12a4nr6"&&(n.textContent=d)},m(J,b){_(s,J,b),p(J,c,b),p(J,a,b),p(J,t,b),p(J,n,b),j=!0},p:$,i(J){j||(I(s.$$.fragment,J),j=!0)},o(J){C(s.$$.fragment,J),j=!1},d(J){J&&(e(c),e(a),e(t),e(n)),g(s,J)}}}function Bs(w){let s,c,a,o;return s=new x({props:{id:"fp8-quantization",option:"FLUX-1.dev",$$slots:{default:[bs]},$$scope:{ctx:w}}}),a=new x({props:{id:"fp8-quantization",option:"HunyuanVideo",$$slots:{default:[Zs]},$$scope:{ctx:w}}}),{c(){Z(s.$$.fragment),c=r(),Z(a.$$.fragment)},l(t){B(s.$$.fragment,t),c=m(t),B(a.$$.fragment,t)},m(t,n){_(s,t,n),p(t,c,n),_(a,t,n),o=!0},p(t,n){const d={};n&2&&(d.$$scope={dirty:n,ctx:t}),s.$set(d);const j={};n&2&&(j.$$scope={dirty:n,ctx:t}),a.$set(j)},i(t){o||(I(s.$$.fragment,t),I(a.$$.fragment,t),o=!0)},o(t){C(s.$$.fragment,t),C(a.$$.fragment,t),o=!1},d(t){t&&e(c),g(s,t),g(a,t)}}}function _s(w){let s,c='请参考 <a href="https://github.com/chengzeyi/ParaAttention/tree/main" rel="nofollow">ParaAttention</a> 仓库获取详细说明和如何使用多个 GPU 扩展推理的示例。';return{c(){s=u("p"),s.innerHTML=c},l(a){s=y(a,"P",{"data-svelte-h":!0}),f(s)!=="svelte-36l4z9"&&(s.innerHTML=c)},m(a,o){p(a,s,o)},p:$,d(a){a&&e(s)}}}function Is(w){let s,c="以下代码示例结合了第一块缓存、fp8动态量化、torch.compile和上下文并行,以实现最快的推理速度。",a,o,t,n,d='保存到<code>run_flux.py</code>并使用<a href="https://pytorch.org/docs/stable/elastic/run.html" rel="nofollow">torchrun</a>启动。',j,J,b,G,U="推理速度降至8.20秒,相比基线快了3.21倍,使用2个NVIDIA L20 GPU。在4个L20上,推理速度为3.90秒,快了6.75倍。",W;return o=new 
Q({props:{code:"aW1wb3J0JTIwdGltZSUwQWltcG9ydCUyMHRvcmNoJTBBaW1wb3J0JTIwdG9yY2guZGlzdHJpYnV0ZWQlMjBhcyUyMGRpc3QlMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwRmx1eFBpcGVsaW5lJTBBJTBBZGlzdC5pbml0X3Byb2Nlc3NfZ3JvdXAoKSUwQSUwQXRvcmNoLmN1ZGEuc2V0X2RldmljZShkaXN0LmdldF9yYW5rKCkpJTBBJTBBcGlwZSUyMCUzRCUyMEZsdXhQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyYmxhY2stZm9yZXN0LWxhYnMlMkZGTFVYLjEtZGV2JTIyJTJDJTBBJTIwJTIwJTIwJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5iZmxvYXQxNiUyQyUwQSkudG8oJTIyY3VkYSUyMiklMEElMEFmcm9tJTIwcGFyYV9hdHRuLmNvbnRleHRfcGFyYWxsZWwlMjBpbXBvcnQlMjBpbml0X2NvbnRleHRfcGFyYWxsZWxfbWVzaCUwQWZyb20lMjBwYXJhX2F0dG4uY29udGV4dF9wYXJhbGxlbC5kaWZmdXNlcnNfYWRhcHRlcnMlMjBpbXBvcnQlMjBwYXJhbGxlbGl6ZV9waXBlJTBBZnJvbSUyMHBhcmFfYXR0bi5wYXJhbGxlbF92YWUuZGlmZnVzZXJzX2FkYXB0ZXJzJTIwaW1wb3J0JTIwcGFyYWxsZWxpemVfdmFlJTBBJTBBbWVzaCUyMCUzRCUyMGluaXRfY29udGV4dF9wYXJhbGxlbF9tZXNoKCUwQSUyMCUyMCUyMCUyMHBpcGUuZGV2aWNlLnR5cGUlMkMlMEElMjAlMjAlMjAlMjBtYXhfcmluZ19kaW1fc2l6ZSUzRDIlMkMlMEEpJTBBcGFyYWxsZWxpemVfcGlwZSglMEElMjAlMjAlMjAlMjBwaXBlJTJDJTBBJTIwJTIwJTIwJTIwbWVzaCUzRG1lc2glMkMlMEEpJTBBcGFyYWxsZWxpemVfdmFlKHBpcGUudmFlJTJDJTIwbWVzaCUzRG1lc2guX2ZsYXR0ZW4oKSklMEElMEFmcm9tJTIwcGFyYV9hdHRuLmZpcnN0X2Jsb2NrX2NhY2hlLmRpZmZ1c2Vyc19hZGFwdGVycyUyMGltcG9ydCUyMGFwcGx5X2NhY2hlX29uX3BpcGUlMEElMEFhcHBseV9jYWNoZV9vbl9waXBlKCUwQSUyMCUyMCUyMCUyMHBpcGUlMkMlMEElMjAlMjAlMjAlMjByZXNpZHVhbF9kaWZmX3RocmVzaG9sZCUzRDAuMTIlMkMlMjAlMjAlMjMlMjAlRTQlQkQlQkYlRTclOTQlQTglRTglQkUlODMlRTUlQTQlQTclRTclOUElODQlRTUlODAlQkMlRTQlQkIlQTUlRTQlQkQlQkYlRTclQkMlOTMlRTUlQUQlOTglRTclOTQlOUYlRTYlOTUlODglMEEpJTBBJTBBZnJvbSUyMHRvcmNoYW8ucXVhbnRpemF0aW9uJTIwaW1wb3J0JTIwcXVhbnRpemVfJTJDJTIwZmxvYXQ4X2R5bmFtaWNfYWN0aXZhdGlvbl9mbG9hdDhfd2VpZ2h0JTJDJTIwZmxvYXQ4X3dlaWdodF9vbmx5JTBBJTBBcXVhbnRpemVfKHBpcGUudGV4dF9lbmNvZGVyJTJDJTIwZmxvYXQ4X3dlaWdodF9vbmx5KCkpJTBBcXVhbnRpemVfKHBpcGUudHJhbnNmb3JtZXIlMkMlMjBmbG9hdDhfZHluYW1pY19hY3RpdmF0aW9uX2Zsb2F0OF93ZWlnaHQoKSklMEF0b3JjaC5faW5kdWN0b3IuY29uZmlnLnJlb3JkZXJfZm9yX2NvbXB1dGVfY29tbV9vdmVybGFwJTIwJTNEJTIwVHJ1
ZSUwQXBpcGUudHJhbnNmb3JtZXIlMjAlM0QlMjB0b3JjaC5jb21waWxlKCUwQSUyMCUyMCUyMHBpcGUudHJhbnNmb3JtZXIlMkMlMjBtb2RlJTNEJTIybWF4LWF1dG90dW5lLW5vLWN1ZGFncmFwaHMlMjIlMkMlMEEpJTBBJTBBJTIzJTIwJUU1JTkwJUFGJUU3JTk0JUE4JUU1JTg2JTg1JUU1JUFEJTk4JUU4JThBJTgyJUU3JTlDJTgxJTBBJTIzJTIwcGlwZS5lbmFibGVfbW9kZWxfY3B1X29mZmxvYWQoZ3B1X2lkJTNEZGlzdC5nZXRfcmFuaygpKSUwQSUyMyUyMHBpcGUuZW5hYmxlX3NlcXVlbnRpYWxfY3B1X29mZmxvYWQoZ3B1X2lkJTNEZGlzdC5nZXRfcmFuaygpKSUwQSUwQWZvciUyMGklMjBpbiUyMHJhbmdlKDIpJTNBJTBBJTIwJTIwJTIwJTIwYmVnaW4lMjAlM0QlMjB0aW1lLnRpbWUoKSUwQSUyMCUyMCUyMCUyMGltYWdlJTIwJTNEJTIwcGlwZSglMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjJBJTIwY2F0JTIwaG9sZGluZyUyMGElMjBzaWduJTIwdGhhdCUyMHNheXMlMjBoZWxsbyUyMHdvcmxkJTIyJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDI4JTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwb3V0cHV0X3R5cGUlM0QlMjJwaWwlMjIlMjBpZiUyMGRpc3QuZ2V0X3JhbmsoKSUyMCUzRCUzRCUyMDAlMjBlbHNlJTIwJTIycHQlMjIlMkMlMEElMjAlMjAlMjAlMjApLmltYWdlcyU1QjAlNUQlMEElMjAlMjAlMjAlMjBlbmQlMjAlM0QlMjB0aW1lLnRpbWUoKSUwQSUyMCUyMCUyMCUyMGlmJTIwZGlzdC5nZXRfcmFuaygpJTIwJTNEJTNEJTIwMCUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwaSUyMCUzRCUzRCUyMDAlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBwcmludChmJTIyJUU5JUEyJTg0JUU3JTgzJUFEJUU2JTk3JUI2JUU5JTk3JUI0JTNBJTIwJTdCZW5kJTIwLSUyMGJlZ2luJTNBLjJmJTdEcyUyMiklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBlbHNlJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcHJpbnQoZiUyMiVFNiU5NyVCNiVFOSU5NyVCNCUzQSUyMCU3QmVuZCUyMC0lMjBiZWdpbiUzQS4yZiU3RHMlMjIpJTBBJTBBaWYlMjBkaXN0LmdldF9yYW5rKCklMjAlM0QlM0QlMjAwJTNBJTBBJTIwJTIwJTIwJTIwcHJpbnQoJTIyJUU1JUIwJTg2JUU1JTlCJUJFJUU1JTgzJThGJUU0JUJGJTlEJUU1JUFEJTk4JUU1JTg4JUIwZmx1eC5wbmclMjIpJTBBJTIwJTIwJTIwJTIwaW1hZ2Uuc2F2ZSglMjJmbHV4LnBuZyUyMiklMEElMEFkaXN0LmRlc3Ryb3lfcHJvY2Vzc19ncm91cCgp",highlighted:`<span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">import</span> torch.distributed <span class="hljs-keyword">as</span> dist
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> FluxPipeline
dist.init_process_group()
torch.cuda.set_device(dist.get_rank())
pipe = FluxPipeline.from_pretrained(
<span class="hljs-string">&quot;black-forest-labs/FLUX.1-dev&quot;</span>,
torch_dtype=torch.bfloat16,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.context_parallel <span class="hljs-keyword">import</span> init_context_parallel_mesh
<span class="hljs-keyword">from</span> para_attn.context_parallel.diffusers_adapters <span class="hljs-keyword">import</span> parallelize_pipe
<span class="hljs-keyword">from</span> para_attn.parallel_vae.diffusers_adapters <span class="hljs-keyword">import</span> parallelize_vae
mesh = init_context_parallel_mesh(
pipe.device.<span class="hljs-built_in">type</span>,
max_ring_dim_size=<span class="hljs-number">2</span>,
)
parallelize_pipe(
pipe,
mesh=mesh,
)
parallelize_vae(pipe.vae, mesh=mesh._flatten())
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(
pipe,
residual_diff_threshold=<span class="hljs-number">0.12</span>, <span class="hljs-comment"># 使用较大的值以使缓存生效</span>
)
<span class="hljs-keyword">from</span> torchao.quantization <span class="hljs-keyword">import</span> quantize_, float8_dynamic_activation_float8_weight, float8_weight_only
quantize_(pipe.text_encoder, float8_weight_only())
quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())
torch._inductor.config.reorder_for_compute_comm_overlap = <span class="hljs-literal">True</span>
pipe.transformer = torch.<span class="hljs-built_in">compile</span>(
pipe.transformer, mode=<span class="hljs-string">&quot;max-autotune-no-cudagraphs&quot;</span>,
)
<span class="hljs-comment"># 启用内存节省</span>
<span class="hljs-comment"># pipe.enable_model_cpu_offload(gpu_id=dist.get_rank())</span>
<span class="hljs-comment"># pipe.enable_sequential_cpu_offload(gpu_id=dist.get_rank())</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">2</span>):
begin = time.time()
image = pipe(
<span class="hljs-string">&quot;A cat holding a sign that says hello world&quot;</span>,
num_inference_steps=<span class="hljs-number">28</span>,
output_type=<span class="hljs-string">&quot;pil&quot;</span> <span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> <span class="hljs-string">&quot;pt&quot;</span>,
).images[<span class="hljs-number">0</span>]
end = time.time()
<span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span>:
<span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;预热时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-keyword">else</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;将图像保存到flux.png&quot;</span>)
image.save(<span class="hljs-string">&quot;flux.png&quot;</span>)
dist.destroy_process_group()`,wrap:!1}}),J=new Q({props:{code:"JTIzJTIwJUU0JUJEJUJGJUU3JTk0JUE4LS1ucHJvY19wZXJfbm9kZSVFNiU4QyU4NyVFNSVBRSU5QUdQVSVFNiU5NSVCMCVFOSU4NyU4RiUwQXRvcmNocnVuJTIwLS1ucHJvY19wZXJfbm9kZSUzRDIlMjBydW5fZmx1eC5weQ==",highlighted:`<span class="hljs-comment"># 使用--nproc_per_node指定GPU数量</span>
torchrun --nproc_per_node=2 run_flux.py`,wrap:!1}}),{c(){s=u("p"),s.textContent=c,a=r(),Z(o.$$.fragment),t=r(),n=u("p"),n.innerHTML=d,j=r(),Z(J.$$.fragment),b=r(),G=u("p"),G.textContent=U},l(M){s=y(M,"P",{"data-svelte-h":!0}),f(s)!=="svelte-1wcnxcs"&&(s.textContent=c),a=m(M),B(o.$$.fragment,M),t=m(M),n=y(M,"P",{"data-svelte-h":!0}),f(n)!=="svelte-bdq9oz"&&(n.innerHTML=d),j=m(M),B(J.$$.fragment,M),b=m(M),G=y(M,"P",{"data-svelte-h":!0}),f(G)!=="svelte-19q14lf"&&(G.textContent=U)},m(M,T){p(M,s,T),p(M,a,T),_(o,M,T),p(M,t,T),p(M,n,T),p(M,j,T),_(J,M,T),p(M,b,T),p(M,G,T),W=!0},p:$,i(M){W||(I(o.$$.fragment,M),I(J.$$.fragment,M),W=!0)},o(M){C(o.$$.fragment,M),C(J.$$.fragment,M),W=!1},d(M){M&&(e(s),e(a),e(t),e(n),e(j),e(b),e(G)),g(o,M),g(J,M)}}}function Cs(w){let s,c="以下代码示例结合了第一块缓存和上下文并行,以实现最快的推理速度。",a,o,t,n,d='保存到 <code>run_hunyuan_video.py</code> 并使用 <a href="https://pytorch.org/docs/stable/elastic/run.html" rel="nofollow">torchrun</a> 启动。',j,J,b,G,U="推理速度降低到 649.23 秒,相比基线快 5.66 倍,使用 8 个 NVIDIA L20 GPU。",W;return o=new 
Q({props:{code:"aW1wb3J0JTIwdGltZSUwQWltcG9ydCUyMHRvcmNoJTBBaW1wb3J0JTIwdG9yY2guZGlzdHJpYnV0ZWQlMjBhcyUyMGRpc3QlMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwSHVueXVhblZpZGVvUGlwZWxpbmUlMkMlMjBIdW55dWFuVmlkZW9UcmFuc2Zvcm1lcjNETW9kZWwlMEFmcm9tJTIwZGlmZnVzZXJzLnV0aWxzJTIwaW1wb3J0JTIwZXhwb3J0X3RvX3ZpZGVvJTBBJTBBZGlzdC5pbml0X3Byb2Nlc3NfZ3JvdXAoKSUwQSUwQXRvcmNoLmN1ZGEuc2V0X2RldmljZShkaXN0LmdldF9yYW5rKCkpJTBBJTBBbW9kZWxfaWQlMjAlM0QlMjAlMjJ0ZW5jZW50JTJGSHVueXVhblZpZGVvJTIyJTBBdHJhbnNmb3JtZXIlMjAlM0QlMjBIdW55dWFuVmlkZW9UcmFuc2Zvcm1lcjNETW9kZWwuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMG1vZGVsX2lkJTJDJTBBJTIwJTIwJTIwJTIwc3ViZm9sZGVyJTNEJTIydHJhbnNmb3JtZXIlMjIlMkMlMEElMjAlMjAlMjAlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmJmbG9hdDE2JTJDJTBBJTIwJTIwJTIwJTIwcmV2aXNpb24lM0QlMjJyZWZzJTJGcHIlMkYxOCUyMiUyQyUwQSklMEFwaXBlJTIwJTNEJTIwSHVueXVhblZpZGVvUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMG1vZGVsX2lkJTJDJTBBJTIwJTIwJTIwJTIwdHJhbnNmb3JtZXIlM0R0cmFuc2Zvcm1lciUyQyUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiUyQyUwQSUyMCUyMCUyMCUyMHJldmlzaW9uJTNEJTIycmVmcyUyRnByJTJGMTglMjIlMkMlMEEpLnRvKCUyMmN1ZGElMjIpJTBBJTBBZnJvbSUyMHBhcmFfYXR0bi5jb250ZXh0X3BhcmFsbGVsJTIwaW1wb3J0JTIwaW5pdF9jb250ZXh0X3BhcmFsbGVsX21lc2glMEFmcm9tJTIwcGFyYV9hdHRuLmNvbnRleHRfcGFyYWxsZWwuZGlmZnVzZXJzX2FkYXB0ZXJzJTIwaW1wb3J0JTIwcGFyYWxsZWxpemVfcGlwZSUwQWZyb20lMjBwYXJhX2F0dG4ucGFyYWxsZWxfdmFlLmRpZmZ1c2Vyc19hZGFwdGVycyUyMGltcG9ydCUyMHBhcmFsbGVsaXplX3ZhZSUwQSUwQW1lc2glMjAlM0QlMjBpbml0X2NvbnRleHRfcGFyYWxsZWxfbWVzaCglMEElMjAlMjAlMjAlMjBwaXBlLmRldmljZS50eXBlJTJDJTBBKSUwQXBhcmFsbGVsaXplX3BpcGUoJTBBJTIwJTIwJTIwJTIwcGlwZSUyQyUwQSUyMCUyMCUyMCUyMG1lc2glM0RtZXNoJTJDJTBBKSUwQXBhcmFsbGVsaXplX3ZhZShwaXBlLnZhZSUyQyUyMG1lc2glM0RtZXNoLl9mbGF0dGVuKCkpJTBBJTBBZnJvbSUyMHBhcmFfYXR0bi5maXJzdF9ibG9ja19jYWNoZS5kaWZmdXNlcnNfYWRhcHRlcnMlMjBpbXBvcnQlMjBhcHBseV9jYWNoZV9vbl9waXBlJTBBJTBBYXBwbHlfY2FjaGVfb25fcGlwZShwaXBlKSUwQSUwQSUyMyUyMGZyb20lMjB0b3JjaGFvLnF1YW50aXphdGlvbiUyMGltcG9ydCUyMHF1YW50aXplXyUyQyUyMGZsb2F0OF9keW5hbWljX2FjdGl2YXRpb25fZmxvYXQ4X3dl
aWdodCUyQyUyMGZsb2F0OF93ZWlnaHRfb25seSUwQSUyMyUwQSUyMyUyMHRvcmNoLl9pbmR1Y3Rvci5jb25maWcucmVvcmRlcl9mb3JfY29tcHV0ZV9jb21tX292ZXJsYXAlMjAlM0QlMjBUcnVlJTBBJTIzJTBBJTIzJTIwcXVhbnRpemVfKHBpcGUudGV4dF9lbmNvZGVyJTJDJTIwZmxvYXQ4X3dlaWdodF9vbmx5KCkpJTBBJTIzJTIwcXVhbnRpemVfKHBpcGUudHJhbnNmb3JtZXIlMkMlMjBmbG9hdDhfZHluYW1pY19hY3RpdmF0aW9uX2Zsb2F0OF93ZWlnaHQoKSklMEElMjMlMjBwaXBlLnRyYW5zZm9ybWVyJTIwJTNEJTIwdG9yY2guY29tcGlsZSglMEElMjMlMjAlMjAlMjAlMjBwaXBlLnRyYW5zZm9ybWVyJTJDJTIwbW9kZSUzRCUyMm1heC1hdXRvdHVuZS1uby1jdWRhZ3JhcGhzJTIyJTJDJTBBJTIzJTIwKSUwQSUwQSUyMyUyMCVFNSU5MCVBRiVFNyU5NCVBOCVFNSU4NiU4NSVFNSVBRCU5OCVFOCU4QSU4MiVFNyU5QyU4MSUwQXBpcGUudmFlLmVuYWJsZV90aWxpbmcoKSUwQSUyMyUyMHBpcGUuZW5hYmxlX21vZGVsX2NwdV9vZmZsb2FkKGdwdV9pZCUzRGRpc3QuZ2V0X3JhbmsoKSklMEElMjMlMjBwaXBlLmVuYWJsZV9zZXF1ZW50aWFsX2NwdV9vZmZsb2FkKGdwdV9pZCUzRGRpc3QuZ2V0X3JhbmsoKSklMEElMEFmb3IlMjBpJTIwaW4lMjByYW5nZSgyKSUzQSUwQSUyMCUyMCUyMCUyMGJlZ2luJTIwJTNEJTIwdGltZS50aW1lKCklMEElMjAlMjAlMjAlMjBvdXRwdXQlMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHByb21wdCUzRCUyMkElMjBjYXQlMjB3YWxrcyUyMG9uJTIwdGhlJTIwZ3Jhc3MlMkMlMjByZWFsaXN0aWMlMjIlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBoZWlnaHQlM0Q3MjAlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB3aWR0aCUzRDEyODAlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBudW1fZnJhbWVzJTNEMTI5JTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDElMjBpZiUyMGklMjAlM0QlM0QlMjAwJTIwZWxzZSUyMDMwJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwb3V0cHV0X3R5cGUlM0QlMjJwaWwlMjIlMjBpZiUyMGRpc3QuZ2V0X3JhbmsoKSUyMCUzRCUzRCUyMDAlMjBlbHNlJTIwJTIycHQlMjIlMkMlMEElMjAlMjAlMjAlMjApLmZyYW1lcyU1QjAlNUQlMEElMjAlMjAlMjAlMjBlbmQlMjAlM0QlMjB0aW1lLnRpbWUoKSUwQSUyMCUyMCUyMCUyMGlmJTIwZGlzdC5nZXRfcmFuaygpJTIwJTNEJTNEJTIwMCUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwaSUyMCUzRCUzRCUyMDAlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBwcmludChmJTIyJUU5JUEyJTg0JUU3JTgzJUFEJUU2JTk3JUI2JUU5JTk3JUI0JTNBJTIwJTdCZW5kJTIwLSUyMGJlZ2luJTNBLjJmJTdEcyUyMiklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBlbHNlJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIw
JTIwJTIwJTIwJTIwJTIwJTIwcHJpbnQoZiUyMiVFNiU5NyVCNiVFOSU5NyVCNCUzQSUyMCU3QmVuZCUyMC0lMjBiZWdpbiUzQS4yZiU3RHMlMjIpJTBBJTBBaWYlMjBkaXN0LmdldF9yYW5rKCklMjAlM0QlM0QlMjAwJTNBJTBBJTIwJTIwJTIwJTIwcHJpbnQoJTIyJUU0JUJGJTlEJUU1JUFEJTk4JUU4JUE3JTg2JUU5JUEyJTkxJUU1JTg4JUIwJTIwaHVueXVhbl92aWRlby5tcDQlMjIpJTBBJTIwJTIwJTIwJTIwZXhwb3J0X3RvX3ZpZGVvKG91dHB1dCUyQyUyMCUyMmh1bnl1YW5fdmlkZW8ubXA0JTIyJTJDJTIwZnBzJTNEMTUpJTBBJTBBZGlzdC5kZXN0cm95X3Byb2Nlc3NfZ3JvdXAoKQ==",highlighted:`<span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">import</span> torch.distributed <span class="hljs-keyword">as</span> dist
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_video
dist.init_process_group()
torch.cuda.set_device(dist.get_rank())
model_id = <span class="hljs-string">&quot;tencent/HunyuanVideo&quot;</span>
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
model_id,
subfolder=<span class="hljs-string">&quot;transformer&quot;</span>,
torch_dtype=torch.bfloat16,
revision=<span class="hljs-string">&quot;refs/pr/18&quot;</span>,
)
pipe = HunyuanVideoPipeline.from_pretrained(
model_id,
transformer=transformer,
torch_dtype=torch.float16,
revision=<span class="hljs-string">&quot;refs/pr/18&quot;</span>,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.context_parallel <span class="hljs-keyword">import</span> init_context_parallel_mesh
<span class="hljs-keyword">from</span> para_attn.context_parallel.diffusers_adapters <span class="hljs-keyword">import</span> parallelize_pipe
<span class="hljs-keyword">from</span> para_attn.parallel_vae.diffusers_adapters <span class="hljs-keyword">import</span> parallelize_vae
mesh = init_context_parallel_mesh(
pipe.device.<span class="hljs-built_in">type</span>,
)
parallelize_pipe(
pipe,
mesh=mesh,
)
parallelize_vae(pipe.vae, mesh=mesh._flatten())
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(pipe)
<span class="hljs-comment"># from torchao.quantization import quantize_, float8_dynamic_activation_float8_weight, float8_weight_only</span>
<span class="hljs-comment">#</span>
<span class="hljs-comment"># torch._inductor.config.reorder_for_compute_comm_overlap = True</span>
<span class="hljs-comment">#</span>
<span class="hljs-comment"># quantize_(pipe.text_encoder, float8_weight_only())</span>
<span class="hljs-comment"># quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())</span>
<span class="hljs-comment"># pipe.transformer = torch.compile(</span>
<span class="hljs-comment"># pipe.transformer, mode=&quot;max-autotune-no-cudagraphs&quot;,</span>
<span class="hljs-comment"># )</span>
<span class="hljs-comment"># 启用内存节省</span>
pipe.vae.enable_tiling()
<span class="hljs-comment"># pipe.enable_model_cpu_offload(gpu_id=dist.get_rank())</span>
<span class="hljs-comment"># pipe.enable_sequential_cpu_offload(gpu_id=dist.get_rank())</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">2</span>):
begin = time.time()
output = pipe(
prompt=<span class="hljs-string">&quot;A cat walks on the grass, realistic&quot;</span>,
height=<span class="hljs-number">720</span>,
width=<span class="hljs-number">1280</span>,
num_frames=<span class="hljs-number">129</span>,
num_inference_steps=<span class="hljs-number">1</span> <span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> <span class="hljs-number">30</span>,
output_type=<span class="hljs-string">&quot;pil&quot;</span> <span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> <span class="hljs-string">&quot;pt&quot;</span>,
).frames[<span class="hljs-number">0</span>]
end = time.time()
<span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span>:
<span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;预热时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-keyword">else</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;保存视频到 hunyuan_video.mp4&quot;</span>)
export_to_video(output, <span class="hljs-string">&quot;hunyuan_video.mp4&quot;</span>, fps=<span class="hljs-number">15</span>)
dist.destroy_process_group()`,wrap:!1}}),J=new Q({props:{code:"JTIzJTIwJUU0JUJEJUJGJUU3JTk0JUE4JTIwLS1ucHJvY19wZXJfbm9kZSUyMCVFNiU4QyU4NyVFNSVBRSU5QSUyMEdQVSUyMCVFNiU5NSVCMCVFOSU4NyU4RiUwQXRvcmNocnVuJTIwLS1ucHJvY19wZXJfbm9kZSUzRDglMjBydW5faHVueXVhbl92aWRlby5weQ==",highlighted:`<span class="hljs-comment"># 使用 --nproc_per_node 指定 GPU 数量</span>
torchrun --nproc_per_node=8 run_hunyuan_video.py`,wrap:!1}}),{c(){s=u("p"),s.textContent=c,a=r(),Z(o.$$.fragment),t=r(),n=u("p"),n.innerHTML=d,j=r(),Z(J.$$.fragment),b=r(),G=u("p"),G.textContent=U},l(M){s=y(M,"P",{"data-svelte-h":!0}),f(s)!=="svelte-58tdmv"&&(s.textContent=c),a=m(M),B(o.$$.fragment,M),t=m(M),n=y(M,"P",{"data-svelte-h":!0}),f(n)!=="svelte-pk6cu0"&&(n.innerHTML=d),j=m(M),B(J.$$.fragment,M),b=m(M),G=y(M,"P",{"data-svelte-h":!0}),f(G)!=="svelte-nfook2"&&(G.textContent=U)},m(M,T){p(M,s,T),p(M,a,T),_(o,M,T),p(M,t,T),p(M,n,T),p(M,j,T),_(J,M,T),p(M,b,T),p(M,G,T),W=!0},p:$,i(M){W||(I(o.$$.fragment,M),I(J.$$.fragment,M),W=!0)},o(M){C(o.$$.fragment,M),C(J.$$.fragment,M),W=!1},d(M){M&&(e(s),e(a),e(t),e(n),e(j),e(b),e(G)),g(o,M),g(J,M)}}}function gs(w){let s,c,a,o;return s=new x({props:{id:"context-parallelism",option:"FLUX-1.dev",$$slots:{default:[Is]},$$scope:{ctx:w}}}),a=new x({props:{id:"context-parallelism",option:"HunyuanVideo",$$slots:{default:[Cs]},$$scope:{ctx:w}}}),{c(){Z(s.$$.fragment),c=r(),Z(a.$$.fragment)},l(t){B(s.$$.fragment,t),c=m(t),B(a.$$.fragment,t)},m(t,n){_(s,t,n),p(t,c,n),_(a,t,n),o=!0},p(t,n){const d={};n&2&&(d.$$scope={dirty:n,ctx:t}),s.$set(d);const j={};n&2&&(j.$$scope={dirty:n,ctx:t}),a.$set(j)},i(t){o||(I(s.$$.fragment,t),I(a.$$.fragment,t),o=!0)},o(t){C(s.$$.fragment,t),C(a.$$.fragment,t),o=!1},d(t){t&&e(c),g(s,t),g(a,t)}}}function Gs(w){let s,c="<thead><tr><th>GPU 类型</th> <th>GPU 数量</th> <th>优化</th> <th>墙钟时间 (s)</th> <th>加速比</th></tr></thead> <tbody><tr><td>NVIDIA L20</td> <td>1</td> <td>基线</td> <td>26.36</td> <td>1.00x</td></tr> <tr><td>NVIDIA L20</td> <td>1</td> <td>FBCache (rdt=0.08)</td> <td>17.01</td> <td>1.55x</td></tr> <tr><td>NVIDIA L20</td> <td>1</td> <td>FP8 DQ</td> <td>13.40</td> <td>1.96x</td></tr> <tr><td>NVIDIA L20</td> <td>1</td> <td>FBCache (rdt=0.12) + FP8 DQ</td> <td>7.56</td> <td>3.48x</td></tr> <tr><td>NVIDIA L20</td> <td>2</td> <td>FBCache (rdt=0.12) + FP8 DQ + CP</td> <td>4.92</td> <td>5.35x</td></tr> 
<tr><td>NVIDIA L20</td> <td>4</td> <td>FBCache (rdt=0.12) + FP8 DQ + CP</td> <td>3.90</td> <td>6.75x</td></tr></tbody>";return{c(){s=u("table"),s.innerHTML=c},l(a){s=y(a,"TABLE",{"data-svelte-h":!0}),f(s)!=="svelte-qm3hdc"&&(s.innerHTML=c)},m(a,o){p(a,s,o)},p:$,d(a){a&&e(s)}}}function Ws(w){let s,c="<thead><tr><th>GPU 类型</th> <th>GPU 数量</th> <th>优化</th> <th>墙钟时间 (s)</th> <th>加速比</th></tr></thead> <tbody><tr><td>NVIDIA L20</td> <td>1</td> <td>基线</td> <td>3675.71</td> <td>1.00x</td></tr></tbody>",a,o,t=`| NVIDIA
L20 | 1 | FBCache | 2271.06 | 1.62x |
| NVIDIA L20 | 2 | FBCache + CP | 1132.90 | 3.24x |
| NVIDIA L20 | 4 | FBCache + CP | 718.15 | 5.12x |
| NVIDIA L20 | 8 | FBCache + CP | 649.23 | 5.66x |`;return{c(){s=u("table"),s.innerHTML=c,a=r(),o=u("p"),o.textContent=t},l(n){s=y(n,"TABLE",{"data-svelte-h":!0}),f(s)!=="svelte-g64hnd"&&(s.innerHTML=c),a=m(n),o=y(n,"P",{"data-svelte-h":!0}),f(o)!=="svelte-138nu2o"&&(o.textContent=t)},m(n,d){p(n,s,d),p(n,a,d),p(n,o,d)},p:$,d(n){n&&(e(s),e(a),e(o))}}}function Vs(w){let s,c,a,o;return s=new x({props:{id:"conclusion",option:"FLUX-1.dev",$$slots:{default:[Gs]},$$scope:{ctx:w}}}),a=new x({props:{id:"conclusion",option:"HunyuanVideo",$$slots:{default:[Ws]},$$scope:{ctx:w}}}),{c(){Z(s.$$.fragment),c=r(),Z(a.$$.fragment)},l(t){B(s.$$.fragment,t),c=m(t),B(a.$$.fragment,t)},m(t,n){_(s,t,n),p(t,c,n),_(a,t,n),o=!0},p(t,n){const d={};n&2&&(d.$$scope={dirty:n,ctx:t}),s.$set(d);const j={};n&2&&(j.$$scope={dirty:n,ctx:t}),a.$set(j)},i(t){o||(I(s.$$.fragment,t),I(a.$$.fragment,t),o=!0)},o(t){C(s.$$.fragment,t),C(a.$$.fragment,t),o=!1},d(t){t&&e(c),g(s,t),g(a,t)}}}function Xs(w){let s,c,a,o,t,n,d,j='<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-performance.png"/>',J,b,G='<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/hunyuan-video-performance.png"/>',U,W,M='大型图像和视频生成模型,如 <a href="https://huggingface.co/black-forest-labs/FLUX.1-dev" rel="nofollow">FLUX.1-dev</a> 和 <a href="https://huggingface.co/tencent/HunyuanVideo" rel="nofollow">HunyuanVideo</a>,由于其规模,可能对实时应用和部署构成推理挑战。',T,k,Y='<a href="https://github.com/chengzeyi/ParaAttention" rel="nofollow">ParaAttention</a> 是一个实现了<strong>上下文并行</strong>和<strong>第一块缓存</strong>的库,可以与其他技术(如 torch.compile、fp8 动态量化)结合使用,以加速推理。',X,v,z=`本指南将展示如何在 NVIDIA L20 GPU 上对 FLUX.1-dev 和 HunyuanVideo 应用 ParaAttention。
在我们的基线基准测试中,除了 HunyuanVideo 为避免内存不足错误外,未应用任何优化。`,h,V,Nl="我们的基线基准测试显示,FLUX.1-dev 能够在 28 步中生成 1024x1024 分辨率图像,耗时 26.36 秒;HunyuanVideo 能够在 30 步中生成 129 帧 720p 分辨率视频,耗时 3675.71 秒。",ul,A,yl,L,dl,q,xl="缓存模型中 transformer 块的输出并在后续推理步骤中重用它们,可以降低计算成本并加速推理。",Ul,D,Yl="然而,很难决定何时重用缓存以确保生成图像或视频的质量。ParaAttention 直接使用<strong>第一个 transformer 块输出的残差差异</strong>来近似模型输出之间的差异。当差异足够小时,重用先前推理步骤的残差差异。换句话说,跳过去噪步骤。",fl,K,zl="这在 FLUX.1-dev 和 HunyuanVideo 推理上实现了 2 倍加速,且质量非常好。",jl,P,Ll='<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/ada-cache.png" alt="Cache in Diffusion Transformer"/> <figcaption>AdaCache 的工作原理,第一块缓存是其变体</figcaption>',hl,E,wl,O,Tl,ll,ql='fp8 动态量化进一步加速推理并减少内存使用。为了使用 8 位 <a href="https://www.nvidia.com/en-us/data-center/tensor-cores/" rel="nofollow">NVIDIA Tensor Cores</a>,必须对激活和权重进行量化。',bl,sl,Dl="使用 <code>float8_weight_only</code> 和 <code>float8_dynamic_activation_float8_weight</code> 来量化文本编码器和变换器模型。",Zl,tl,Kl="默认量化方法是逐张量量化,但如果您的 GPU 支持逐行量化,您也可以尝试它以获得更好的准确性。",Bl,el,Pl='使用以下命令安装 <a href="https://github.com/pytorch/ao/tree/main" rel="nofollow">torchao</a>。',_l,al,Il,nl,Ol='<a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html" rel="nofollow">torch.compile</a> 使用 <code>mode=&quot;max-autotune-no-cudagraphs&quot;</code> 或 <code>mode=&quot;max-autotune&quot;</code> 选择最佳内核以获得性能。如果是第一次调用模型,编译可能会花费很长时间,但一旦模型编译完成,这是值得的。',Cl,pl,ls="此示例仅量化变换器模型,但您也可以量化文本编码器以进一步减少内存使用。",gl,F,Gl,R,Wl,il,Vl,ol,ss="上下文并行性并行化推理并随多个 GPU 扩展。ParaAttention 组合设计允许您将上下文并行性与第一块缓存和动态量化结合使用。",Xl,H,$l,Ml,ts='如果推理过程需要持久化和可服务,建议使用 <a href="https://pytorch.org/docs/stable/multiprocessing.html" rel="nofollow">torch.multiprocessing</a> 编写您自己的推理处理器。这可以消除启动进程以及加载和重新编译模型的开销。',kl,S,vl,cl,Ql,N,Al,rl,El,ml,Fl;return t=new Jl({props:{title:"ParaAttention",local:"paraattention",headingTag:"h1"}}),A=new Sl({props:{warning:!1,$$slots:{default:[fs]},$$scope:{ctx:w}}}),L=new Jl({props:{title:"第一块缓存",local:"第一块缓存",headingTag:"h2"}}),E=new 
Hl({props:{id:"first-block-cache",options:["FLUX-1.dev","HunyuanVideo"],$$slots:{default:[ws]},$$scope:{ctx:w}}}),O=new Jl({props:{title:"fp8 量化",local:"fp8-量化",headingTag:"h2"}}),al=new Q({props:{code:"cGlwMyUyMGluc3RhbGwlMjAtVSUyMHRvcmNoJTIwdG9yY2hhbw==",highlighted:"pip3 install -U torch torchao",wrap:!1}}),F=new Sl({props:{warning:!1,$$slots:{default:[Ts]},$$scope:{ctx:w}}}),R=new Hl({props:{id:"fp8-quantization",options:["FLUX-1.dev","HunyuanVideo"],$$slots:{default:[Bs]},$$scope:{ctx:w}}}),il=new Jl({props:{title:"上下文并行性",local:"上下文并行性",headingTag:"h2"}}),H=new Sl({props:{warning:!1,$$slots:{default:[_s]},$$scope:{ctx:w}}}),S=new Hl({props:{id:"context-parallelism",options:["FLUX-1.dev","HunyuanVideo"],$$slots:{default:[gs]},$$scope:{ctx:w}}}),cl=new Jl({props:{title:"基准测试",local:"基准测试",headingTag:"h2"}}),N=new Hl({props:{id:"conclusion",options:["FLUX-1.dev","HunyuanVideo"],$$slots:{default:[Vs]},$$scope:{ctx:w}}}),rl=new Us({props:{source:"https://github.com/huggingface/diffusers/blob/main/docs/source/zh/optimization/para_attn.md"}}),{c(){s=u("meta"),c=r(),a=u("p"),o=r(),Z(t.$$.fragment),n=r(),d=u("div"),d.innerHTML=j,J=r(),b=u("div"),b.innerHTML=G,U=r(),W=u("p"),W.innerHTML=M,T=r(),k=u("p"),k.innerHTML=Y,X=r(),v=u("p"),v.textContent=z,h=r(),V=u("p"),V.textContent=Nl,ul=r(),Z(A.$$.fragment),yl=r(),Z(L.$$.fragment),dl=r(),q=u("p"),q.textContent=xl,Ul=r(),D=u("p"),D.innerHTML=Yl,fl=r(),K=u("p"),K.textContent=zl,jl=r(),P=u("figure"),P.innerHTML=Ll,hl=r(),Z(E.$$.fragment),wl=r(),Z(O.$$.fragment),Tl=r(),ll=u("p"),ll.innerHTML=ql,bl=r(),sl=u("p"),sl.innerHTML=Dl,Zl=r(),tl=u("p"),tl.textContent=Kl,Bl=r(),el=u("p"),el.innerHTML=Pl,_l=r(),Z(al.$$.fragment),Il=r(),nl=u("p"),nl.innerHTML=Ol,Cl=r(),pl=u("p"),pl.textContent=ls,gl=r(),Z(F.$$.fragment),Gl=r(),Z(R.$$.fragment),Wl=r(),Z(il.$$.fragment),Vl=r(),ol=u("p"),ol.textContent=ss,Xl=r(),Z(H.$$.fragment),$l=r(),Ml=u("p"),Ml.innerHTML=ts,kl=r(),Z(S.$$.fragment),vl=r(),Z(cl.$$.fragment),Ql=r(),Z(N.$$.fragment),Al=r(),Z(r
l.$$.fragment),El=r(),ml=u("p"),this.h()},l(l){const i=ys("svelte-u9bgzb",document.head);s=y(i,"META",{name:!0,content:!0}),i.forEach(e),c=m(l),a=y(l,"P",{}),cs(a).forEach(e),o=m(l),B(t.$$.fragment,l),n=m(l),d=y(l,"DIV",{class:!0,"data-svelte-h":!0}),f(d)!=="svelte-1p4slnk"&&(d.innerHTML=j),J=m(l),b=y(l,"DIV",{class:!0,"data-svelte-h":!0}),f(b)!=="svelte-1kqq4mt"&&(b.innerHTML=G),U=m(l),W=y(l,"P",{"data-svelte-h":!0}),f(W)!=="svelte-1aznnck"&&(W.innerHTML=M),T=m(l),k=y(l,"P",{"data-svelte-h":!0}),f(k)!=="svelte-nxzfeq"&&(k.innerHTML=Y),X=m(l),v=y(l,"P",{"data-svelte-h":!0}),f(v)!=="svelte-ckrkoc"&&(v.textContent=z),h=m(l),V=y(l,"P",{"data-svelte-h":!0}),f(V)!=="svelte-wz0gnx"&&(V.textContent=Nl),ul=m(l),B(A.$$.fragment,l),yl=m(l),B(L.$$.fragment,l),dl=m(l),q=y(l,"P",{"data-svelte-h":!0}),f(q)!=="svelte-17ussag"&&(q.textContent=xl),Ul=m(l),D=y(l,"P",{"data-svelte-h":!0}),f(D)!=="svelte-1dduett"&&(D.innerHTML=Yl),fl=m(l),K=y(l,"P",{"data-svelte-h":!0}),f(K)!=="svelte-1y9k7c"&&(K.textContent=zl),jl=m(l),P=y(l,"FIGURE",{"data-svelte-h":!0}),f(P)!=="svelte-1cjjxth"&&(P.innerHTML=Ll),hl=m(l),B(E.$$.fragment,l),wl=m(l),B(O.$$.fragment,l),Tl=m(l),ll=y(l,"P",{"data-svelte-h":!0}),f(ll)!=="svelte-iz99fc"&&(ll.innerHTML=ql),bl=m(l),sl=y(l,"P",{"data-svelte-h":!0}),f(sl)!=="svelte-3fzx9w"&&(sl.innerHTML=Dl),Zl=m(l),tl=y(l,"P",{"data-svelte-h":!0}),f(tl)!=="svelte-f45i0k"&&(tl.textContent=Kl),Bl=m(l),el=y(l,"P",{"data-svelte-h":!0}),f(el)!=="svelte-1s76vee"&&(el.innerHTML=Pl),_l=m(l),B(al.$$.fragment,l),Il=m(l),nl=y(l,"P",{"data-svelte-h":!0}),f(nl)!=="svelte-ubr0pd"&&(nl.innerHTML=Ol),Cl=m(l),pl=y(l,"P",{"data-svelte-h":!0}),f(pl)!=="svelte-1w4d2au"&&(pl.textContent=ls),gl=m(l),B(F.$$.fragment,l),Gl=m(l),B(R.$$.fragment,l),Wl=m(l),B(il.$$.fragment,l),Vl=m(l),ol=y(l,"P",{"data-svelte-h":!0}),f(ol)!=="svelte-125hian"&&(ol.textContent=ss),Xl=m(l),B(H.$$.fragment,l),$l=m(l),Ml=y(l,"P",{"data-svelte-h":!0}),f(Ml)!=="svelte-gwlw48"&&(Ml.innerHTML=ts),kl=m(l),B(S.$$.fragment,l),vl=m(l
),B(cl.$$.fragment,l),Ql=m(l),B(N.$$.fragment,l),Al=m(l),B(rl.$$.fragment,l),El=m(l),ml=y(l,"P",{}),cs(ml).forEach(e),this.h()},h(){Rl(s,"name","hf:doc:metadata"),Rl(s,"content",$s),Rl(d,"class","flex justify-center"),Rl(b,"class","flex justify-center")},m(l,i){ds(document.head,s),p(l,c,i),p(l,a,i),p(l,o,i),_(t,l,i),p(l,n,i),p(l,d,i),p(l,J,i),p(l,b,i),p(l,U,i),p(l,W,i),p(l,T,i),p(l,k,i),p(l,X,i),p(l,v,i),p(l,h,i),p(l,V,i),p(l,ul,i),_(A,l,i),p(l,yl,i),_(L,l,i),p(l,dl,i),p(l,q,i),p(l,Ul,i),p(l,D,i),p(l,fl,i),p(l,K,i),p(l,jl,i),p(l,P,i),p(l,hl,i),_(E,l,i),p(l,wl,i),_(O,l,i),p(l,Tl,i),p(l,ll,i),p(l,bl,i),p(l,sl,i),p(l,Zl,i),p(l,tl,i),p(l,Bl,i),p(l,el,i),p(l,_l,i),_(al,l,i),p(l,Il,i),p(l,nl,i),p(l,Cl,i),p(l,pl,i),p(l,gl,i),_(F,l,i),p(l,Gl,i),_(R,l,i),p(l,Wl,i),_(il,l,i),p(l,Vl,i),p(l,ol,i),p(l,Xl,i),_(H,l,i),p(l,$l,i),p(l,Ml,i),p(l,kl,i),_(S,l,i),p(l,vl,i),_(cl,l,i),p(l,Ql,i),_(N,l,i),p(l,Al,i),_(rl,l,i),p(l,El,i),p(l,ml,i),Fl=!0},p(l,[i]){const es={};i&2&&(es.$$scope={dirty:i,ctx:l}),A.$set(es);const as={};i&2&&(as.$$scope={dirty:i,ctx:l}),E.$set(as);const ns={};i&2&&(ns.$$scope={dirty:i,ctx:l}),F.$set(ns);const ps={};i&2&&(ps.$$scope={dirty:i,ctx:l}),R.$set(ps);const is={};i&2&&(is.$$scope={dirty:i,ctx:l}),H.$set(is);const os={};i&2&&(os.$$scope={dirty:i,ctx:l}),S.$set(os);const 
Ms={};i&2&&(Ms.$$scope={dirty:i,ctx:l}),N.$set(Ms)},i(l){Fl||(I(t.$$.fragment,l),I(A.$$.fragment,l),I(L.$$.fragment,l),I(E.$$.fragment,l),I(O.$$.fragment,l),I(al.$$.fragment,l),I(F.$$.fragment,l),I(R.$$.fragment,l),I(il.$$.fragment,l),I(H.$$.fragment,l),I(S.$$.fragment,l),I(cl.$$.fragment,l),I(N.$$.fragment,l),I(rl.$$.fragment,l),Fl=!0)},o(l){C(t.$$.fragment,l),C(A.$$.fragment,l),C(L.$$.fragment,l),C(E.$$.fragment,l),C(O.$$.fragment,l),C(al.$$.fragment,l),C(F.$$.fragment,l),C(R.$$.fragment,l),C(il.$$.fragment,l),C(H.$$.fragment,l),C(S.$$.fragment,l),C(cl.$$.fragment,l),C(N.$$.fragment,l),C(rl.$$.fragment,l),Fl=!1},d(l){l&&(e(c),e(a),e(o),e(n),e(d),e(J),e(b),e(U),e(W),e(T),e(k),e(X),e(v),e(h),e(V),e(ul),e(yl),e(dl),e(q),e(Ul),e(D),e(fl),e(K),e(jl),e(P),e(hl),e(wl),e(Tl),e(ll),e(bl),e(sl),e(Zl),e(tl),e(Bl),e(el),e(_l),e(Il),e(nl),e(Cl),e(pl),e(gl),e(Gl),e(Wl),e(Vl),e(ol),e(Xl),e($l),e(Ml),e(kl),e(vl),e(Ql),e(Al),e(El),e(ml)),e(s),g(t,l),g(A,l),g(L,l),g(E,l),g(O,l),g(al,l),g(F,l),g(R,l),g(il,l),g(H,l),g(S,l),g(cl,l),g(N,l),g(rl,l)}}}const $s='{"title":"ParaAttention","local":"paraattention","sections":[{"title":"第一块缓存","local":"第一块缓存","sections":[],"depth":2},{"title":"fp8 量化","local":"fp8-量化","sections":[],"depth":2},{"title":"上下文并行性","local":"上下文并行性","sections":[],"depth":2},{"title":"基准测试","local":"基准测试","sections":[],"depth":2}],"depth":1}';function ks(w){return ms(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Hs extends Js{constructor(s){super(),us(this,s,ks,Xs,rs,{})}}export{Hs as component};

Xet Storage Details

Size:
59.9 kB
·
Xet hash:
6503381b213a7db3d1ed40dd256a5d1440bdf5d6a8c0c628754553d8e1c8e945

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.