Buckets:
/*
 * Generated, minified page module for the Chinese "令牌合并" (token merging / ToMe)
 * documentation page. The `source` prop passed to `vt` below points at
 * docs/source/zh/optimization/tome.md in huggingface/diffusers. The `svelte-…`
 * markers and `data-svelte-h` attributes suggest Svelte compiler output, so
 * changes belong in that markdown source rather than in this file.
 *
 * Definitions in this module:
 *
 *   Qt(rt)  Fragment factory. Declares the node/component variables, defines the
 *           HTML strings (ot, st, at, nt, Mt, pt, mt, ft, ct, ut, Ut, yt),
 *           instantiates six child components (p and g via `ht` with
 *           title/local/headingTag props; c, U and w via `it` with
 *           code/highlighted/wrap props; v via `vt` with a source URL), and
 *           returns an object with these methods:
 *             c()    builds nodes with o(...)/i(), calls Q(...) on each child
 *                    component fragment, assigns the HTML strings to innerHTML,
 *                    then calls this.h().
 *             l(t)   builds the same nodes with s(...)/r(...) from `t`, calls
 *                    j(...) on each child component fragment, and assigns
 *                    innerHTML only when a(node) differs from the hard-coded
 *                    "svelte-…" string for that node; then calls this.h().
 *             h()    sets name="hf:doc:metadata" and content=jt on the meta
 *                    node, and class="flex justify-center" on the image wrapper.
 *             m(t,e) passes the meta node to Vt(document.head, …), calls
 *                    l(t, node, e) for every node in document order and
 *                    H(component, t, e) for every child component, then sets
 *                    the `dt` flag.
 *             p      the imported `gt` (presumably a no-op, since the page has
 *                    no reactive state; the helper is not visible here).
 *             i(t)   calls J(...) on the six component fragments unless `dt` is
 *                    already set, then sets `dt`.
 *             o(t)   calls L(...) on the six component fragments and clears `dt`.
 *             d(t)   when `t` is truthy calls d(...) on every body node; always
 *                    calls d(n) on the meta node and R(component, t) on the six
 *                    child components.
 *
 *   jt      JSON string holding title/local/sections metadata for the page;
 *           h() writes it to the meta node's `content` attribute.
 *
 *   Ht(rt)  Instance function. Registers a callback through `Ot` that reads the
 *           "fw" query parameter from window.location.search and discards the
 *           value; returns an empty array.
 *
 *   St      Component class extending `wt`; its constructor calls
 *           Ct(this, n, Ht, Qt, bt, {}). Exported as `component`.
 *
 * The one- and two-letter helpers (o, i, s, r, a, d, l, lt, Vt, Q, j, H, J, L, R,
 * $t, Tt, Ot, gt, bt, wt, Ct) are imported from ../chunks/* and are not visible
 * here. Presumably they are the Svelte runtime's create/claim/insert/detach and
 * component lifecycle helpers, with `Ot` as onMount; confirm against the chunk
 * files before relying on that.
 *
 * NOTE(review): the `ct` string ends with "结果" and its next line starts with
 * "结果是从…", which looks like a duplicated word in the translated markdown.
 * Fix it upstream in tome.md.
 *
 * NOTE(review): in this copy every line is wrapped in "| … | |" and the
 * double-quoted `Ut` string literal is split across two lines. Neither is valid
 * JavaScript, and both look like extraction damage. Regenerate the file from the
 * docs build instead of hand-repairing it.
 */
| import{s as bt,n as gt,o as Ot}from"../chunks/scheduler.5c93273d.js";import{S as wt,i as Ct,g as o,s as i,r as Q,A as $t,h as s,f as d,c as r,j as Tt,u as j,x as a,k as lt,y as Vt,a as l,v as H,d as J,t as L,w as R}from"../chunks/index.e43dd92b.js";import{C as it}from"../chunks/CodeBlock.6896320e.js";import{H as ht,E as vt}from"../chunks/getInferenceSnippets.22672bbf.js";function Qt(rt){let n,_,F,Z,p,B,m,ot='<a href="https://huggingface.co/papers/2303.17604" rel="nofollow">令牌合并</a>(ToMe)在基于 Transformer 的网络的前向传递中逐步合并冗余令牌/补丁,这可以加速 <code>StableDiffusionPipeline</code> 的推理延迟。',E,f,st="从 <code>pip</code> 安装 ToMe:",x,c,G,u,at='您可以使用 <a href="https://github.com/dbolya/tomesd" rel="nofollow"><code>tomesd</code></a> 库中的 <a href="https://github.com/dbolya/tomesd?tab=readme-ov-file#usage" rel="nofollow"><code>apply_patch</code></a> 函数:',P,U,A,y,nt='<code>apply_patch</code> 函数公开了多个<a href="https://github.com/dbolya/tomesd#usage" rel="nofollow">参数</a>,以帮助在管道推理速度和生成令牌的质量之间取得平衡。最重要的参数是 <code>ratio</code>,它控制在前向传递期间合并的令牌数量。',W,T,Mt='如<a href="https://huggingface.co/papers/2303.17604" rel="nofollow">论文</a>中所述,ToMe 可以在显著提升推理速度的同时,很大程度上保留生成图像的质量。通过增加 <code>ratio</code>,您可以进一步加速推理,但代价是图像质量有所下降。',k,h,pt='为了测试生成图像的质量,我们从 <a href="https://parti.research.google/" rel="nofollow">Parti Prompts</a> 中采样了一些提示,并使用 <code>StableDiffusionPipeline</code> 进行了推理,设置如下:',I,M,mt='<img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/tome/tome_samples.png"/>',z,b,ft='我们没有注意到生成样本的质量有任何显著下降,您可以在此 <a href="https://wandb.ai/sayakpaul/tomesd-results/runs/23j4bj3i?workspace=" rel="nofollow">WandB 报告</a>中查看生成的样本。如果您有兴趣重现此实验,请使用此<a href="https://gist.github.com/sayakpaul/8cac98d7f22399085a060992f411ecbd" rel="nofollow">脚本</a>。',N,g,Y,O,ct=`我们还在启用 <a href="https://huggingface.co/docs/diffusers/optimization/xformers" rel="nofollow">xFormers</a> 的情况下,对 <code>StableDiffusionPipeline</code> 上 <code>tomesd</code> 的影响进行了基准测试,涵盖了多个图像分辨率。结果 | |
| 结果是从以下开发环境中的A100和V100 GPU获得的:`,D,w,X,C,ut='要重现此基准测试,请随意使用此<a href="https://gist.github.com/sayakpaul/27aec6bca7eb7b0e0aa4112205850335" rel="nofollow">脚本</a>。结果以秒为单位报告,并且在适用的情况下,我们报告了使用ToMe和ToMe + xFormers时相对于原始管道的加速百分比。',q,$,Ut="<thead><tr><th><strong>GPU</strong></th> <th><strong>分辨率</strong></th> <th><strong>批处理大小</strong></th> <th><strong>原始</strong></th> <th><strong>ToMe</strong></th> <th><strong>ToMe + xFormers</strong></th></tr></thead> <tbody><tr><td><strong>A100</strong></td> <td>512</td> <td>10</td> <td>6.88</td> <td>5.26 (+23.55%)</td> <td>4.69 (+31.83%)</td></tr> <tr><td></td> <td>768</td> <td>10</td> <td>OOM</td> <td>14.71</td> <td>11</td></tr> <tr><td></td> <td></td> <td>8</td> <td>OOM</td> <td>11.56</td> <td>8.84</td></tr> <tr><td></td> <td></td> <td>4</td> <td>OOM</td> <td>5.98</td> <td>4.66</td></tr> <tr><td></td> <td></td> <td>2</td> <td>4.99</td> <td>3.24 (+35.07%)</td> <td>2.1 (+37.88%)</td></tr> <tr><td></td> <td></td> <td>1</td> <td>3.29</td> <td>2.24 (+31.91%)</td> <td>2.03 (+38.3%)</td></tr> <tr><td></td> <td>1024</td> <td>10</td> <td>OOM</td> <td>OOM</td> <td>OOM</td></tr> <tr><td></td> <td></td> <td>8</td> <td>OOM</td> <td>OOM</td> <td>OOM</td></tr> <tr><td></td> <td></td> <td>4</td> <td>OOM</td> <td>12.51</td> <td>9.09</td></tr> <tr><td></td> <td></td> <td>2</td> <td>OOM</td> <td>6.52</td> <td>4.96</td></tr> <tr><td></td> <td></td> <td>1</td> <td>6.4</td> <td>3.61 (+43.59%)</td> <td>2.81 (+56.09%)</td></tr> <tr><td><strong>V100</strong></td> <td>512</td> <td>10</td> <td>OOM</td> <td>10.03</td> <td>9.29</td></tr> <tr><td></td> <td></td> <td>8</td> <td>OOM</td> <td>8.05</td> <td>7.47</td></tr> <tr><td></td> <td></td> <td>4</td> <td>5.7</td> <td>4.3 (+24.56%)</td> <td>3.98 (+30.18%)</td></tr> <tr><td></td> <td></td> <td>2</td> <td>3.14</td> <td>2.43 (+22.61%)</td> <td>2.27 (+27.71%)</td></tr> <tr><td></td> <td></td> <td>1</td> <td>1.88</td> <td>1.57 (+16.49%)</td> <td>1.57 (+16.49%)</td></tr> <tr><td></td> <td>768</td> <td>10</td> 
<td>OOM</td> <td>OOM</td> <td>23.67</td></tr> <tr><td></td> <td></td> <td>8</td> <td>OOM</td> <td>OOM</td> <td>18.81</td></tr> <tr><td></td> <td></td> <td>4</td> <td>OOM</td> <td>11.81</td> <td>9.7</td></tr> <tr><td></td> <td></td> <td>2</td> <td>OOM</td> <td>6.27</td> <td>5.2</td></tr> <tr><td></td> <td></td> <td>1</td> <td>5.43</td> <td>3.38 (+37.75%)</td> <td>2.82 (+48.07%)</td></tr> <tr><td></td> <td>1024</td> <td>10</td> <td>OOM</td> <td></td> <td></td></tr></tbody>",K,V,yt='如上表所示,<code>tomesd</code> 带来的加速效果在更大的图像分辨率下变得更加明显。有趣的是,使用 <code>tomesd</code> 可以在更高分辨率如 1024x1024 上运行管道。您可能还可以通过 <a href="fp16#torchcompile"><code>torch.compile</code></a> 进一步加速推理。',tt,v,et,S,dt;return p=new ht({props:{title:"令牌合并",local:"令牌合并",headingTag:"h1"}}),c=new it({props:{code:"cGlwJTIwaW5zdGFsbCUyMHRvbWVzZA==",highlighted:"pip install tomesd",wrap:!1}}),U=new it({props:{code:"JTIwJTIwZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMFN0YWJsZURpZmZ1c2lvblBpcGVsaW5lJTBBJTIwJTIwaW1wb3J0JTIwdG9yY2glMEElMjAlMjBpbXBvcnQlMjB0b21lc2QlMEElMEElMjAlMjBwaXBlbGluZSUyMCUzRCUyMFN0YWJsZURpZmZ1c2lvblBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjJzdGFibGUtZGlmZnVzaW9uLXYxLTUlMkZzdGFibGUtZGlmZnVzaW9uLXYxLTUlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmZsb2F0MTYlMkMlMjB1c2Vfc2FmZXRlbnNvcnMlM0RUcnVlJTJDJTBBJTIwJTIwKS50byglMjJjdWRhJTIyKSUwQSUyQiUyMHRvbWVzZC5hcHBseV9wYXRjaChwaXBlbGluZSUyQyUyMHJhdGlvJTNEMC41KSUwQSUwQSUyMCUyMGltYWdlJTIwJTNEJTIwcGlwZWxpbmUoJTIyYSUyMHBob3RvJTIwb2YlMjBhbiUyMGFzdHJvbmF1dCUyMHJpZGluZyUyMGElMjBob3JzZSUyMG9uJTIwbWFycyUyMikuaW1hZ2VzJTVCMCU1RA==",highlighted:` from diffusers import StableDiffusionPipeline | |
| import torch | |
| import tomesd | |
| pipeline = StableDiffusionPipeline.from_pretrained( | |
| "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True, | |
| ).to("cuda") | |
| <span class="hljs-addition">+ tomesd.apply_patch(pipeline, ratio=0.5)</span> | |
| image = pipeline("a photo of an astronaut riding a horse on mars").images[0]`,wrap:!1}}),g=new ht({props:{title:"基准测试",local:"基准测试",headingTag:"h2"}}),w=new it({props:{code:"LSUyMCU2MGRpZmZ1c2VycyU2MCUyMCVFNyU4OSU4OCVFNiU5QyVBQyVFRiVCQyU5QTAuMTUuMSUwQS0lMjBQeXRob24lMjAlRTclODklODglRTYlOUMlQUMlRUYlQkMlOUEzLjguMTYlMEEtJTIwUHlUb3JjaCUyMCVFNyU4OSU4OCVFNiU5QyVBQyVFRiVCQyU4OEdQVSVFRiVCQyU5RiVFRiVCQyU4OSVFRiVCQyU5QTEuMTMuMSUyQmN1MTE2JTIwKFRydWUpJTBBLSUyMEh1Z2dpbmdmYWNlX2h1YiUyMCVFNyU4OSU4OCVFNiU5QyVBQyVFRiVCQyU5QTAuMTMuMiUwQS0lMjBUcmFuc2Zvcm1lcnMlMjAlRTclODklODglRTYlOUMlQUMlRUYlQkMlOUE0LjI3LjIlMEEtJTIwQWNjZWxlcmF0ZSUyMCVFNyU4OSU4OCVFNiU5QyVBQyVFRiVCQyU5QTAuMTguMCUwQS0lMjB4Rm9ybWVycyUyMCVFNyU4OSU4OCVFNiU5QyVBQyVFRiVCQyU5QTAuMC4xNiUwQS0lMjB0b21lc2QlMjAlRTclODklODglRTYlOUMlQUMlRUYlQkMlOUEwLjEuMg==",highlighted:`- \`diffusers\` 版本:0.15.1 | |
| - Python 版本:3.8.16 | |
| - PyTorch 版本(GPU?):1.13.1+cu116 (True) | |
| - Huggingface_hub 版本:0.13.2 | |
| - Transformers 版本:4.27.2 | |
| - Accelerate 版本:0.18.0 | |
| - xFormers 版本:0.0.16 | |
| - tomesd 版本:0.1.2`,wrap:!1}}),v=new vt({props:{source:"https://github.com/huggingface/diffusers/blob/main/docs/source/zh/optimization/tome.md"}}),{c(){n=o("meta"),_=i(),F=o("p"),Z=i(),Q(p.$$.fragment),B=i(),m=o("p"),m.innerHTML=ot,E=i(),f=o("p"),f.innerHTML=st,x=i(),Q(c.$$.fragment),G=i(),u=o("p"),u.innerHTML=at,P=i(),Q(U.$$.fragment),A=i(),y=o("p"),y.innerHTML=nt,W=i(),T=o("p"),T.innerHTML=Mt,k=i(),h=o("p"),h.innerHTML=pt,I=i(),M=o("div"),M.innerHTML=mt,z=i(),b=o("p"),b.innerHTML=ft,N=i(),Q(g.$$.fragment),Y=i(),O=o("p"),O.innerHTML=ct,D=i(),Q(w.$$.fragment),X=i(),C=o("p"),C.innerHTML=ut,q=i(),$=o("table"),$.innerHTML=Ut,K=i(),V=o("p"),V.innerHTML=yt,tt=i(),Q(v.$$.fragment),et=i(),S=o("p"),this.h()},l(t){const e=$t("svelte-u9bgzb",document.head);n=s(e,"META",{name:!0,content:!0}),e.forEach(d),_=r(t),F=s(t,"P",{}),Tt(F).forEach(d),Z=r(t),j(p.$$.fragment,t),B=r(t),m=s(t,"P",{"data-svelte-h":!0}),a(m)!=="svelte-snw6ty"&&(m.innerHTML=ot),E=r(t),f=s(t,"P",{"data-svelte-h":!0}),a(f)!=="svelte-7kc1oz"&&(f.innerHTML=st),x=r(t),j(c.$$.fragment,t),G=r(t),u=s(t,"P",{"data-svelte-h":!0}),a(u)!=="svelte-s9hu2t"&&(u.innerHTML=at),P=r(t),j(U.$$.fragment,t),A=r(t),y=s(t,"P",{"data-svelte-h":!0}),a(y)!=="svelte-om2tu0"&&(y.innerHTML=nt),W=r(t),T=s(t,"P",{"data-svelte-h":!0}),a(T)!=="svelte-1dgeizi"&&(T.innerHTML=Mt),k=r(t),h=s(t,"P",{"data-svelte-h":!0}),a(h)!=="svelte-mpit8s"&&(h.innerHTML=pt),I=r(t),M=s(t,"DIV",{class:!0,"data-svelte-h":!0}),a(M)!=="svelte-ng3g1s"&&(M.innerHTML=mt),z=r(t),b=s(t,"P",{"data-svelte-h":!0}),a(b)!=="svelte-za0786"&&(b.innerHTML=ft),N=r(t),j(g.$$.fragment,t),Y=r(t),O=s(t,"P",{"data-svelte-h":!0}),a(O)!=="svelte-1glgh8p"&&(O.innerHTML=ct),D=r(t),j(w.$$.fragment,t),X=r(t),C=s(t,"P",{"data-svelte-h":!0}),a(C)!=="svelte-15l389v"&&(C.innerHTML=ut),q=r(t),$=s(t,"TABLE",{"data-svelte-h":!0}),a($)!=="svelte-9agb9s"&&($.innerHTML=Ut),K=r(t),V=s(t,"P",{"data-svelte-h":!0}),a(V)!=="svelte-1nf0h2i"&&(V.innerHTML=yt),tt=r(t),j(v.$$.fragment,t),et=r(t),S=s(t,"P",{}
),Tt(S).forEach(d),this.h()},h(){lt(n,"name","hf:doc:metadata"),lt(n,"content",jt),lt(M,"class","flex justify-center")},m(t,e){Vt(document.head,n),l(t,_,e),l(t,F,e),l(t,Z,e),H(p,t,e),l(t,B,e),l(t,m,e),l(t,E,e),l(t,f,e),l(t,x,e),H(c,t,e),l(t,G,e),l(t,u,e),l(t,P,e),H(U,t,e),l(t,A,e),l(t,y,e),l(t,W,e),l(t,T,e),l(t,k,e),l(t,h,e),l(t,I,e),l(t,M,e),l(t,z,e),l(t,b,e),l(t,N,e),H(g,t,e),l(t,Y,e),l(t,O,e),l(t,D,e),H(w,t,e),l(t,X,e),l(t,C,e),l(t,q,e),l(t,$,e),l(t,K,e),l(t,V,e),l(t,tt,e),H(v,t,e),l(t,et,e),l(t,S,e),dt=!0},p:gt,i(t){dt||(J(p.$$.fragment,t),J(c.$$.fragment,t),J(U.$$.fragment,t),J(g.$$.fragment,t),J(w.$$.fragment,t),J(v.$$.fragment,t),dt=!0)},o(t){L(p.$$.fragment,t),L(c.$$.fragment,t),L(U.$$.fragment,t),L(g.$$.fragment,t),L(w.$$.fragment,t),L(v.$$.fragment,t),dt=!1},d(t){t&&(d(_),d(F),d(Z),d(B),d(m),d(E),d(f),d(x),d(G),d(u),d(P),d(A),d(y),d(W),d(T),d(k),d(h),d(I),d(M),d(z),d(b),d(N),d(Y),d(O),d(D),d(X),d(C),d(q),d($),d(K),d(V),d(tt),d(et),d(S)),d(n),R(p,t),R(c,t),R(U,t),R(g,t),R(w,t),R(v,t)}}}const jt='{"title":"令牌合并","local":"令牌合并","sections":[{"title":"基准测试","local":"基准测试","sections":[],"depth":2}],"depth":1}';function Ht(rt){return Ot(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class St extends wt{constructor(n){super(),Ct(this,n,Ht,Qt,bt,{})}}export{St as component}; | |
Xet Storage Details
- Size:
- 11.2 kB
- Xet hash:
- bf31cf5ea88a4a25841e10d79f9278ef0e0b3c466be7fe2e6b517821746717e3
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.