Buckets:

rtrm's picture
download
raw
11.5 kB
// Compiled Svelte component for the Chinese "令牌合并" (Token Merging / ToMe)
// documentation page. This file is build output: the prose lives in the
// `innerHTML` string constants inside `Ht`, and the DOM is assembled by the
// runtime helpers imported from the hashed chunks below.
//
// NOTE(review): the imported helpers are minified exports of chunks that are
// not visible here. Comments describing what they do are inferred from how
// they are called in this file and should be confirmed against the chunks.
import { s as Ot, n as $t, o as wt } from "../chunks/scheduler.e4ff9b64.js";
import {
  S as Ct,
  i as Vt,
  e as o,
  s as i,
  c as j,
  h as vt,
  a as s,
  d,
  b as r,
  f as bt,
  g as L,
  j as n,
  k as rt,
  l as Qt,
  m as l,
  n as H,
  t as J,
  o as R,
  p as S,
} from "../chunks/index.09f1bca0.js";
import { C as jt, H as gt, E as Lt } from "../chunks/MermaidChart.svelte_svelte_type_style_lang.02b27692.js";
import { C as ot } from "../chunks/CodeBlock.f59589c7.js";

/**
 * Fragment factory for the page.
 *
 * Builds the child components (header widget, headings, code blocks, footer
 * link) and returns an object with the lifecycle hooks used by the runtime:
 * `c` (create), `l` (claim existing nodes), `h` (set attributes), `m` (mount),
 * `p` (update, a no-op here), `i` / `o` (transition in / out) and `d` (destroy).
 *
 * @param {*} st - Context argument supplied by the runtime; not read here.
 * @returns {object} The fragment's lifecycle hooks.
 */
function Ht(st) {
  // ---- Static HTML for the prose blocks, in page order -------------------
  const nt = '<a href="https://huggingface.co/papers/2303.17604" rel="nofollow">令牌合并</a>(ToMe)在基于 Transformer 的网络的前向传递中逐步合并冗余令牌/补丁,这可以加速 <code>StableDiffusionPipeline</code> 的推理延迟。';
  const at = "从 <code>pip</code> 安装 ToMe:";
  const Mt = '您可以使用 <a href="https://github.com/dbolya/tomesd" rel="nofollow"><code>tomesd</code></a> 库中的 <a href="https://github.com/dbolya/tomesd?tab=readme-ov-file#usage" rel="nofollow"><code>apply_patch</code></a> 函数:';
  const pt = '<code>apply_patch</code> 函数公开了多个<a href="https://github.com/dbolya/tomesd#usage" rel="nofollow">参数</a>,以帮助在管道推理速度和生成令牌的质量之间取得平衡。最重要的参数是 <code>ratio</code>,它控制在前向传递期间合并的令牌数量。';
  const mt = '如<a href="https://huggingface.co/papers/2303.17604" rel="nofollow">论文</a>中所述,ToMe 可以在显著提升推理速度的同时,很大程度上保留生成图像的质量。通过增加 <code>ratio</code>,您可以进一步加速推理,但代价是图像质量有所下降。';
  const ft = '为了测试生成图像的质量,我们从 <a href="https://parti.research.google/" rel="nofollow">Parti Prompts</a> 中采样了一些提示,并使用 <code>StableDiffusionPipeline</code> 进行了推理,设置如下:';
  const ut = '<img src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/tome/tome_samples.png"/>';
  const ct = '我们没有注意到生成样本的质量有任何显著下降,您可以在此 <a href="https://wandb.ai/sayakpaul/tomesd-results/runs/23j4bj3i?workspace=" rel="nofollow">WandB 报告</a>中查看生成的样本。如果您有兴趣重现此实验,请使用此<a href="https://gist.github.com/sayakpaul/8cac98d7f22399085a060992f411ecbd" rel="nofollow">脚本</a>。';
  // Fix: the first sentence used to end with an orphaned "结果" followed by a
  // line break, so the page read "…图像分辨率。结果 结果是从…". The stray word
  // and the line break that carried it are removed.
  // NOTE(review): the hash "svelte-1glgh8p" checked in `l()` was generated for
  // the old text and cannot be recomputed here. Regenerate this file from the
  // corrected markdown so pre-rendered pages pick up the fix as well.
  const yt = '我们还在启用 <a href="https://huggingface.co/docs/diffusers/optimization/xformers" rel="nofollow">xFormers</a> 的情况下,对 <code>StableDiffusionPipeline</code> 上 <code>tomesd</code> 的影响进行了基准测试,涵盖了多个图像分辨率。结果是从以下开发环境中的A100和V100 GPU获得的:';
  const Ut = '要重现此基准测试,请随意使用此<a href="https://gist.github.com/sayakpaul/27aec6bca7eb7b0e0aa4112205850335" rel="nofollow">脚本</a>。结果以秒为单位报告,并且在适用的情况下,我们报告了使用ToMe和ToMe + xFormers时相对于原始管道的加速百分比。';
  // Benchmark table. One entry per table row; rows are joined with the single
  // space that separates them in the generated markup. (The original literal
  // was a double-quoted string broken across two physical lines, which is not
  // a valid JavaScript string literal.)
  const Tt = [
    "<thead><tr><th><strong>GPU</strong></th> <th><strong>分辨率</strong></th> <th><strong>批处理大小</strong></th> <th><strong>原始</strong></th> <th><strong>ToMe</strong></th> <th><strong>ToMe + xFormers</strong></th></tr></thead>",
    "<tbody><tr><td><strong>A100</strong></td> <td>512</td> <td>10</td> <td>6.88</td> <td>5.26 (+23.55%)</td> <td>4.69 (+31.83%)</td></tr>",
    "<tr><td></td> <td>768</td> <td>10</td> <td>OOM</td> <td>14.71</td> <td>11</td></tr>",
    "<tr><td></td> <td></td> <td>8</td> <td>OOM</td> <td>11.56</td> <td>8.84</td></tr>",
    "<tr><td></td> <td></td> <td>4</td> <td>OOM</td> <td>5.98</td> <td>4.66</td></tr>",
    "<tr><td></td> <td></td> <td>2</td> <td>4.99</td> <td>3.24 (+35.07%)</td> <td>2.1 (+37.88%)</td></tr>",
    "<tr><td></td> <td></td> <td>1</td> <td>3.29</td> <td>2.24 (+31.91%)</td> <td>2.03 (+38.3%)</td></tr>",
    "<tr><td></td> <td>1024</td> <td>10</td> <td>OOM</td> <td>OOM</td> <td>OOM</td></tr>",
    "<tr><td></td> <td></td> <td>8</td> <td>OOM</td> <td>OOM</td> <td>OOM</td></tr>",
    "<tr><td></td> <td></td> <td>4</td> <td>OOM</td> <td>12.51</td> <td>9.09</td></tr>",
    "<tr><td></td> <td></td> <td>2</td> <td>OOM</td> <td>6.52</td> <td>4.96</td></tr>",
    "<tr><td></td> <td></td> <td>1</td> <td>6.4</td> <td>3.61 (+43.59%)</td> <td>2.81 (+56.09%)</td></tr>",
    "<tr><td><strong>V100</strong></td> <td>512</td> <td>10</td> <td>OOM</td> <td>10.03</td> <td>9.29</td></tr>",
    "<tr><td></td> <td></td> <td>8</td> <td>OOM</td> <td>8.05</td> <td>7.47</td></tr>",
    "<tr><td></td> <td></td> <td>4</td> <td>5.7</td> <td>4.3 (+24.56%)</td> <td>3.98 (+30.18%)</td></tr>",
    "<tr><td></td> <td></td> <td>2</td> <td>3.14</td> <td>2.43 (+22.61%)</td> <td>2.27 (+27.71%)</td></tr>",
    "<tr><td></td> <td></td> <td>1</td> <td>1.88</td> <td>1.57 (+16.49%)</td> <td>1.57 (+16.49%)</td></tr>",
    "<tr><td></td> <td>768</td> <td>10</td> <td>OOM</td> <td>OOM</td> <td>23.67</td></tr>",
    "<tr><td></td> <td></td> <td>8</td> <td>OOM</td> <td>OOM</td> <td>18.81</td></tr>",
    "<tr><td></td> <td></td> <td>4</td> <td>OOM</td> <td>11.81</td> <td>9.7</td></tr>",
    "<tr><td></td> <td></td> <td>2</td> <td>OOM</td> <td>6.27</td> <td>5.2</td></tr>",
    "<tr><td></td> <td></td> <td>1</td> <td>5.43</td> <td>3.38 (+37.75%)</td> <td>2.82 (+48.07%)</td></tr>",
    "<tr><td></td> <td>1024</td> <td>10</td> <td>OOM</td> <td></td> <td></td></tr></tbody>",
  ].join(" ");
  const ht = '如上表所示,<code>tomesd</code> 带来的加速效果在更大的图像分辨率下变得更加明显。有趣的是,使用 <code>tomesd</code> 可以在更高分辨率如 1024x1024 上运行管道。您可能还可以通过 <a href="fp16#torchcompile"><code>torch.compile</code></a> 进一步加速推理。';

  // ---- DOM node handles, assigned in c() or l() ---------------------------
  // `a` is the <meta> element placed in document.head; the others are the
  // page's elements and the separator nodes between them, in page order.
  let a, Z, F, x, B, E, f, G, u, P, A, y, W, k, T, z, h, I, b, N, M, Y, g, D;
  let X, $, q, K, C, tt, V, et, v, dt, lt, _;
  // Set by m()/i() and cleared by o(); guards i() against repeating its work.
  let it;

  // ---- Child components ---------------------------------------------------
  // Floating widget rendered first on the page (component `C` of the shared chunk).
  const p = new jt({
    props: {
      containerStyle: "float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;",
    },
  });
  // Page title (h1).
  const m = new gt({ props: { title: "令牌合并", local: "令牌合并", headingTag: "h1" } });
  // Install command. `code` is base64 of the URL-encoded raw snippet;
  // `highlighted` is the markup that is displayed.
  const c = new ot({
    props: {
      code: "cGlwJTIwaW5zdGFsbCUyMHRvbWVzZA==",
      highlighted: "pip install tomesd",
      wrap: !1,
    },
  });
  // Usage example showing the added `tomesd.apply_patch` line.
  const U = new ot({
    props: {
      code: "JTIwJTIwZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMFN0YWJsZURpZmZ1c2lvblBpcGVsaW5lJTBBJTIwJTIwaW1wb3J0JTIwdG9yY2glMEElMjAlMjBpbXBvcnQlMjB0b21lc2QlMEElMEElMjAlMjBwaXBlbGluZSUyMCUzRCUyMFN0YWJsZURpZmZ1c2lvblBpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjJzdGFibGUtZGlmZnVzaW9uLXYxLTUlMkZzdGFibGUtZGlmZnVzaW9uLXYxLTUlMjIlMkMlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmZsb2F0MTYlMkMlMjB1c2Vfc2FmZXRlbnNvcnMlM0RUcnVlJTJDJTBBJTIwJTIwKS50byglMjJjdWRhJTIyKSUwQSUyQiUyMHRvbWVzZC5hcHBseV9wYXRjaChwaXBlbGluZSUyQyUyMHJhdGlvJTNEMC41KSUwQSUwQSUyMCUyMGltYWdlJTIwJTNEJTIwcGlwZWxpbmUoJTIyYSUyMHBob3RvJTIwb2YlMjBhbiUyMGFzdHJvbmF1dCUyMHJpZGluZyUyMGElMjBob3JzZSUyMG9uJTIwbWFycyUyMikuaW1hZ2VzJTVCMCU1RA==",
      highlighted: [
        " from diffusers import StableDiffusionPipeline",
        "import torch",
        "import tomesd",
        "pipeline = StableDiffusionPipeline.from_pretrained(",
        "&quot;stable-diffusion-v1-5/stable-diffusion-v1-5&quot;, torch_dtype=torch.float16, use_safetensors=True,",
        ").to(&quot;cuda&quot;)",
        '<span class="hljs-addition">+ tomesd.apply_patch(pipeline, ratio=0.5)</span>',
        "image = pipeline(&quot;a photo of an astronaut riding a horse on mars&quot;).images[0]",
      ].join("\n"),
      wrap: !1,
    },
  });
  // "Benchmark" section heading (h2).
  const O = new gt({ props: { title: "基准测试", local: "基准测试", headingTag: "h2" } });
  // Versions of the benchmark environment.
  const w = new ot({
    props: {
      code: "LSUyMCU2MGRpZmZ1c2VycyU2MCUyMCVFNyU4OSU4OCVFNiU5QyVBQyVFRiVCQyU5QTAuMTUuMSUwQS0lMjBQeXRob24lMjAlRTclODklODglRTYlOUMlQUMlRUYlQkMlOUEzLjguMTYlMEEtJTIwUHlUb3JjaCUyMCVFNyU4OSU4OCVFNiU5QyVBQyVFRiVCQyU4OEdQVSVFRiVCQyU5RiVFRiVCQyU4OSVFRiVCQyU5QTEuMTMuMSUyQmN1MTE2JTIwKFRydWUpJTBBLSUyMEh1Z2dpbmdmYWNlX2h1YiUyMCVFNyU4OSU4OCVFNiU5QyVBQyVFRiVCQyU5QTAuMTMuMiUwQS0lMjBUcmFuc2Zvcm1lcnMlMjAlRTclODklODglRTYlOUMlQUMlRUYlQkMlOUE0LjI3LjIlMEEtJTIwQWNjZWxlcmF0ZSUyMCVFNyU4OSU4OCVFNiU5QyVBQyVFRiVCQyU5QTAuMTguMCUwQS0lMjB4Rm9ybWVycyUyMCVFNyU4OSU4OCVFNiU5QyVBQyVFRiVCQyU5QTAuMC4xNiUwQS0lMjB0b21lc2QlMjAlRTclODklODglRTYlOUMlQUMlRUYlQkMlOUEwLjEuMg==",
      highlighted: [
        "- `diffusers` 版本:0.15.1",
        "- Python 版本:3.8.16",
        "- PyTorch 版本(GPU?):1.13.1+cu116 (True)",
        "- Huggingface_hub 版本:0.13.2",
        "- Transformers 版本:4.27.2",
        "- Accelerate 版本:0.18.0",
        "- xFormers 版本:0.0.16",
        "- tomesd 版本:0.1.2",
      ].join("\n"),
      wrap: !1,
    },
  });
  // Footer component that receives the URL of the page's markdown source.
  const Q = new Lt({
    props: {
      source: "https://github.com/huggingface/diffusers/blob/main/docs/source/zh/optimization/tome.md",
    },
  });

  return {
    // Create: build every node from scratch and fill in the static HTML.
    c() {
      a = o("meta");
      Z = i();
      F = o("p");
      x = i();
      j(p.$$.fragment);
      B = i();
      j(m.$$.fragment);
      E = i();
      f = o("p");
      f.innerHTML = nt;
      G = i();
      u = o("p");
      u.innerHTML = at;
      P = i();
      j(c.$$.fragment);
      A = i();
      y = o("p");
      y.innerHTML = Mt;
      W = i();
      j(U.$$.fragment);
      k = i();
      T = o("p");
      T.innerHTML = pt;
      z = i();
      h = o("p");
      h.innerHTML = mt;
      I = i();
      b = o("p");
      b.innerHTML = ft;
      N = i();
      M = o("div");
      M.innerHTML = ut;
      Y = i();
      g = o("p");
      g.innerHTML = ct;
      D = i();
      j(O.$$.fragment);
      X = i();
      $ = o("p");
      $.innerHTML = yt;
      q = i();
      j(w.$$.fragment);
      K = i();
      C = o("p");
      C.innerHTML = Ut;
      tt = i();
      V = o("table");
      V.innerHTML = Tt;
      et = i();
      v = o("p");
      v.innerHTML = ht;
      dt = i();
      j(Q.$$.fragment);
      lt = i();
      _ = o("p");
      this.h();
    },
    // Claim: adopt nodes that already exist under `t` instead of creating new
    // ones. For each static block the HTML is rewritten only when the value
    // returned by `n(node)` differs from the expected "svelte-…" tag
    // (presumably the node's data-svelte-h content hash; confirm in the chunk).
    l(t) {
      const e = vt("svelte-u9bgzb", document.head);
      a = s(e, "META", { name: !0, content: !0 });
      e.forEach(d);
      Z = r(t);
      F = s(t, "P", {});
      bt(F).forEach(d);
      x = r(t);
      L(p.$$.fragment, t);
      B = r(t);
      L(m.$$.fragment, t);
      E = r(t);
      f = s(t, "P", { "data-svelte-h": !0 });
      if (n(f) !== "svelte-snw6ty") f.innerHTML = nt;
      G = r(t);
      u = s(t, "P", { "data-svelte-h": !0 });
      if (n(u) !== "svelte-7kc1oz") u.innerHTML = at;
      P = r(t);
      L(c.$$.fragment, t);
      A = r(t);
      y = s(t, "P", { "data-svelte-h": !0 });
      if (n(y) !== "svelte-s9hu2t") y.innerHTML = Mt;
      W = r(t);
      L(U.$$.fragment, t);
      k = r(t);
      T = s(t, "P", { "data-svelte-h": !0 });
      if (n(T) !== "svelte-om2tu0") T.innerHTML = pt;
      z = r(t);
      h = s(t, "P", { "data-svelte-h": !0 });
      if (n(h) !== "svelte-1dgeizi") h.innerHTML = mt;
      I = r(t);
      b = s(t, "P", { "data-svelte-h": !0 });
      if (n(b) !== "svelte-mpit8s") b.innerHTML = ft;
      N = r(t);
      M = s(t, "DIV", { class: !0, "data-svelte-h": !0 });
      if (n(M) !== "svelte-ng3g1s") M.innerHTML = ut;
      Y = r(t);
      g = s(t, "P", { "data-svelte-h": !0 });
      if (n(g) !== "svelte-za0786") g.innerHTML = ct;
      D = r(t);
      L(O.$$.fragment, t);
      X = r(t);
      $ = s(t, "P", { "data-svelte-h": !0 });
      if (n($) !== "svelte-1glgh8p") $.innerHTML = yt;
      q = r(t);
      L(w.$$.fragment, t);
      K = r(t);
      C = s(t, "P", { "data-svelte-h": !0 });
      if (n(C) !== "svelte-15l389v") C.innerHTML = Ut;
      tt = r(t);
      V = s(t, "TABLE", { "data-svelte-h": !0 });
      if (n(V) !== "svelte-9agb9s") V.innerHTML = Tt;
      et = r(t);
      v = s(t, "P", { "data-svelte-h": !0 });
      if (n(v) !== "svelte-1nf0h2i") v.innerHTML = ht;
      dt = r(t);
      L(Q.$$.fragment, t);
      lt = r(t);
      _ = s(t, "P", {});
      bt(_).forEach(d);
      this.h();
    },
    // Attributes shared by the create and claim paths.
    h() {
      rt(a, "name", "hf:doc:metadata");
      rt(a, "content", Jt);
      rt(M, "class", "flex justify-center");
    },
    // Mount: the <meta> goes into document.head; everything else is inserted
    // into target `t` before anchor `e`, in page order.
    m(t, e) {
      Qt(document.head, a);
      l(t, Z, e);
      l(t, F, e);
      l(t, x, e);
      H(p, t, e);
      l(t, B, e);
      H(m, t, e);
      l(t, E, e);
      l(t, f, e);
      l(t, G, e);
      l(t, u, e);
      l(t, P, e);
      H(c, t, e);
      l(t, A, e);
      l(t, y, e);
      l(t, W, e);
      H(U, t, e);
      l(t, k, e);
      l(t, T, e);
      l(t, z, e);
      l(t, h, e);
      l(t, I, e);
      l(t, b, e);
      l(t, N, e);
      l(t, M, e);
      l(t, Y, e);
      l(t, g, e);
      l(t, D, e);
      H(O, t, e);
      l(t, X, e);
      l(t, $, e);
      l(t, q, e);
      H(w, t, e);
      l(t, K, e);
      l(t, C, e);
      l(t, tt, e);
      l(t, V, e);
      l(t, et, e);
      l(t, v, e);
      l(t, dt, e);
      H(Q, t, e);
      l(t, lt, e);
      l(t, _, e);
      it = !0;
    },
    // Update: the page has no reactive state, so the shared no-op is used.
    p: $t,
    // Transition in: forwarded to each child component once per mount cycle.
    i(t) {
      if (it) return;
      J(p.$$.fragment, t);
      J(m.$$.fragment, t);
      J(c.$$.fragment, t);
      J(U.$$.fragment, t);
      J(O.$$.fragment, t);
      J(w.$$.fragment, t);
      J(Q.$$.fragment, t);
      it = !0;
    },
    // Transition out: forwarded to each child component.
    o(t) {
      R(p.$$.fragment, t);
      R(m.$$.fragment, t);
      R(c.$$.fragment, t);
      R(U.$$.fragment, t);
      R(O.$$.fragment, t);
      R(w.$$.fragment, t);
      R(Q.$$.fragment, t);
      it = !1;
    },
    // Destroy: when `t` is truthy the page's own nodes are passed to `d`; the
    // <meta> node and the child components are released unconditionally.
    d(t) {
      if (t) {
        d(Z);
        d(F);
        d(x);
        d(B);
        d(E);
        d(f);
        d(G);
        d(u);
        d(P);
        d(A);
        d(y);
        d(W);
        d(k);
        d(T);
        d(z);
        d(h);
        d(I);
        d(b);
        d(N);
        d(M);
        d(Y);
        d(g);
        d(D);
        d(X);
        d($);
        d(q);
        d(K);
        d(C);
        d(tt);
        d(V);
        d(et);
        d(v);
        d(dt);
        d(lt);
        d(_);
      }
      d(a);
      S(p, t);
      S(m, t);
      S(c, t);
      S(U, t);
      S(O, t);
      S(w, t);
      S(Q, t);
    },
  };
}

// JSON table of contents written to the page's "hf:doc:metadata" <meta> tag.
const Jt = '{"title":"令牌合并","local":"令牌合并","sections":[{"title":"基准测试","local":"基准测试","sections":[],"depth":2}],"depth":1}';

/**
 * Instance function of the component.
 *
 * Registers a callback that reads the `fw` query parameter from the current
 * URL; the value is discarded, exactly as in the generated original.
 *
 * @param {*} st - Unused.
 * @returns {Array} An empty array (the component has no instance state).
 */
function Rt(st) {
  wt(() => {
    new URLSearchParams(window.location.search).get("fw");
  });
  return [];
}

/** Page component wiring the instance function and fragment factory together. */
class xt extends Ct {
  constructor(a) {
    super();
    Vt(this, a, Rt, Ht, Ot, {});
  }
}

export { xt as component };

Xet Storage Details

Size:
11.5 kB
·
Xet hash:
94e90ec7c57d290326ff30d7c9d57c8df43f455504836ed092dac2ebcbc62c39

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.