Buckets:
| import{s as We,o as Be,n as Se}from"../chunks/scheduler.b9285784.js";import{S as He,i as Xe,e as r,s as n,c,h as Ee,a as o,d as l,b as a,f as Le,g as d,j as m,k as Ae,l as Ie,m as s,n as f,t as h,o as u,p as $}from"../chunks/index.26bc89a1.js";import{T as Ye}from"../chunks/Tip.e4eba3d6.js";import{C as Ve,H as R,E as Re}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.7a0ae628.js";import{C as we}from"../chunks/CodeBlock.844ff9c3.js";function Fe(I){let i,g="In this example, there are two GPUs for “Multi-GPU” and a TPU pod with 8 workers";return{c(){i=r("p"),i.textContent=g},l(p){i=o(p,"P",{"data-svelte-h":!0}),m(i)!=="svelte-k9rq2f"&&(i.textContent=g)},m(p,b){s(p,i,b)},p:Se,d(p){p&&l(i)}}}function Ne(I){let i,g=`Since users can have their own learning rate schedulers defined, we leave this up to the user to decide if they wish to scale their | |
| learning rate or not.`;return{c(){i=r("p"),i.textContent=g},l(p){i=o(p,"P",{"data-svelte-h":!0}),m(i)!=="svelte-1birp9c"&&(i.textContent=g)},m(p,b){s(p,i,b)},p:Se,d(p){p&&l(i)}}}function Qe(I){let i,g,p,b,v,F,x,N,T,Me=`Evaluating and comparing the performance from different setups can be quite tricky if you don’t know what to look for. | |
| For example, you cannot run the same script with the same batch size across TPU, multi-GPU, and single-GPU with Accelerate | |
| and expect your results to line up.`,Q,_,ve="But why?",q,C,xe="There are three reasons for this that this tutorial will cover:",K,y,Te="<li><strong>Setting the right seeds</strong></li> <li><strong>Observed Batch Sizes</strong></li> <li><strong>Learning Rates</strong></li>",O,j,D,U,_e='While this issue has not come up as much, make sure to use <a href="/docs/accelerate/pr_4021/en/package_reference/utilities#accelerate.utils.set_seed">utils.set_seed()</a> to fully set the seed in all distributed cases so training will be reproducible:',ee,P,te,z,Ce="Why is this important? Under the hood this will set <strong>5</strong> different seed settings:",le,G,se,k,ye="The random state, numpy’s state, torch, torch’s device state, and if TPUs are available torch_xla’s cuda state.",ne,Z,ae,J,je=`When training with Accelerate, the batch size passed to the dataloader is the <strong>batch size per GPU</strong>. What this entails is | |
| a batch size of 64 on two GPUs is truly a batch size of 128. As a result, when testing on a single GPU this needs to be accounted for, | |
| as well as similarly for TPUs.`,ie,L,Ue="The below table can be used as a quick reference to try out different batch sizes:",re,w,oe,A,Pe="<thead><tr><th>Single GPU Batch Size</th> <th>Multi-GPU Equivalent Batch Size</th> <th>TPU Equivalent Batch Size</th></tr></thead> <tbody><tr><td>256</td> <td>128</td> <td>32</td></tr> <tr><td>128</td> <td>64</td> <td>16</td></tr> <tr><td>64</td> <td>32</td> <td>8</td></tr> <tr><td>32</td> <td>16</td> <td>4</td></tr></tbody>",pe,Y,me,S,ze=`As noted in multiple sources[<a href="https://aws.amazon.com/blogs/machine-learning/scalable-multi-node-deep-learning-training-using-gpus-in-the-aws-cloud/" rel="nofollow">1</a>][<a href="https://docs.nvidia.com/clara/clara-train-sdk/pt/model.html#classification-models-multi-gpu-training" rel="nofollow">2</a>], the learning rate should be scaled <em>linearly</em> based on the number of devices present. The below | |
| snippet shows doing so with Accelerate:`,ce,M,de,W,fe,B,Ge=`You will also find that <code>accelerate</code> will step the learning rate based on the number of processes being trained on. This is because | |
| of the observed batch size noted earlier. So in the case of 2 GPUs, the learning rate will be stepped twice as often as a single GPU | |
| to account for the batch size being twice as large (if no changes to the batch size on the single GPU instance are made).`,he,H,ue,X,ke=`When using gradient accumulation and mixed precision, due to how gradient averaging works (accumulation) and the precision loss (mixed precision), | |
| some degradation in performance is expected. This will be explicitly seen when comparing the batch-wise loss between different compute | |
| setups. However, the overall loss, metric, and general performance at the end of training should be <em>roughly</em> the same.`,$e,E,ge,V,be;return v=new Ve({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),x=new R({props:{title:"Comparing performance across distributed setups",local:"comparing-performance-across-distributed-setups",headingTag:"h1"}}),j=new R({props:{title:"Setting the Seed",local:"setting-the-seed",headingTag:"h2"}}),P=new we({props:{code:"ZnJvbSUyMGFjY2VsZXJhdGUudXRpbHMlMjBpbXBvcnQlMjBzZXRfc2VlZCUwQSUwQXNldF9zZWVkKDQyKQ==",highlighted:`<span class="hljs-keyword">from</span> accelerate.utils <span class="hljs-keyword">import</span> set_seed | |
| set_seed(<span class="hljs-number">42</span>)`,wrap:!1}}),G=new we({props:{code:"JTIwJTIwJTIwJTIwcmFuZG9tLnNlZWQoc2VlZCklMEElMjAlMjAlMjAlMjBucC5yYW5kb20uc2VlZChzZWVkKSUwQSUyMCUyMCUyMCUyMHRvcmNoLm1hbnVhbF9zZWVkKHNlZWQpJTBBJTIwJTIwJTIwJTIwdG9yY2guY3VkYS5tYW51YWxfc2VlZF9hbGwoc2VlZCklMjAlMjMlMjBvciUyMHRvcmNoLnhwdS5tYW51YWxfc2VlZF9hbGwlMkMlMjBldGMlMEElMjAlMjAlMjAlMjAlMjMlMjAlNUUlNUUlMjBzYWZlJTIwdG8lMjBjYWxsJTIwdGhpcyUyMGZ1bmN0aW9uJTIwZXZlbiUyMGlmJTIwY3VkYSUyMGlzJTIwbm90JTIwYXZhaWxhYmxlJTBBJTIwJTIwJTIwJTIwaWYlMjBpc190b3JjaF94bGFfYXZhaWxhYmxlKCklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB4bS5zZXRfcm5nX3N0YXRlKHNlZWQp",highlighted:` random.seed(seed) | |
| np.random.seed(seed) | |
| torch.manual_seed(seed) | |
| torch.cuda.manual_seed_all(seed) <span class="hljs-comment"># or torch.xpu.manual_seed_all, etc</span> | |
| <span class="hljs-comment"># ^^ safe to call this function even if cuda is not available</span> | |
| <span class="hljs-keyword">if</span> is_torch_xla_available(): | |
| xm.set_rng_state(seed)`,wrap:!1}}),Z=new R({props:{title:"Observed Batch Sizes",local:"observed-batch-sizes",headingTag:"h2"}}),w=new Ye({props:{$$slots:{default:[Fe]},$$scope:{ctx:I}}}),Y=new R({props:{title:"Learning Rates",local:"learning-rates",headingTag:"h2"}}),M=new Ye({props:{$$slots:{default:[Ne]},$$scope:{ctx:I}}}),W=new we({props:{code:"bGVhcm5pbmdfcmF0ZSUyMCUzRCUyMDFlLTMlMEFhY2NlbGVyYXRvciUyMCUzRCUyMEFjY2VsZXJhdG9yKCklMEFsZWFybmluZ19yYXRlJTIwKiUzRCUyMGFjY2VsZXJhdG9yLm51bV9wcm9jZXNzZXMlMEElMEFvcHRpbWl6ZXIlMjAlM0QlMjBBZGFtVyhwYXJhbXMlM0Rtb2RlbC5wYXJhbWV0ZXJzKCklMkMlMjBsciUzRGxlYXJuaW5nX3JhdGUp",highlighted:`learning_rate = <span class="hljs-number">1e-3</span> | |
| accelerator = Accelerator() | |
| learning_rate *= accelerator.num_processes | |
| optimizer = AdamW(params=model.parameters(), lr=learning_rate)`,wrap:!1}}),H=new R({props:{title:"Gradient Accumulation and Mixed Precision",local:"gradient-accumulation-and-mixed-precision",headingTag:"h2"}}),E=new Re({props:{source:"https://github.com/huggingface/accelerate/blob/main/docs/source/concept_guides/performance.md"}}),{c(){i=r("meta"),g=n(),p=r("p"),b=n(),c(v.$$.fragment),F=n(),c(x.$$.fragment),N=n(),T=r("p"),T.textContent=Me,Q=n(),_=r("p"),_.textContent=ve,q=n(),C=r("p"),C.textContent=xe,K=n(),y=r("ol"),y.innerHTML=Te,O=n(),c(j.$$.fragment),D=n(),U=r("p"),U.innerHTML=_e,ee=n(),c(P.$$.fragment),te=n(),z=r("p"),z.innerHTML=Ce,le=n(),c(G.$$.fragment),se=n(),k=r("p"),k.textContent=ye,ne=n(),c(Z.$$.fragment),ae=n(),J=r("p"),J.innerHTML=je,ie=n(),L=r("p"),L.textContent=Ue,re=n(),c(w.$$.fragment),oe=n(),A=r("table"),A.innerHTML=Pe,pe=n(),c(Y.$$.fragment),me=n(),S=r("p"),S.innerHTML=ze,ce=n(),c(M.$$.fragment),de=n(),c(W.$$.fragment),fe=n(),B=r("p"),B.innerHTML=Ge,he=n(),c(H.$$.fragment),ue=n(),X=r("p"),X.innerHTML=ke,$e=n(),c(E.$$.fragment),ge=n(),V=r("p"),this.h()},l(e){const t=Ee("svelte-u9bgzb",document.head);i=o(t,"META",{name:!0,content:!0}),t.forEach(l),g=a(e),p=o(e,"P",{}),Le(p).forEach(l),b=a(e),d(v.$$.fragment,e),F=a(e),d(x.$$.fragment,e),N=a(e),T=o(e,"P",{"data-svelte-h":!0}),m(T)!=="svelte-1u8i7ng"&&(T.textContent=Me),Q=a(e),_=o(e,"P",{"data-svelte-h":!0}),m(_)!=="svelte-1pzk3n6"&&(_.textContent=ve),q=a(e),C=o(e,"P",{"data-svelte-h":!0}),m(C)!=="svelte-1px1vea"&&(C.textContent=xe),K=a(e),y=o(e,"OL",{"data-svelte-h":!0}),m(y)!=="svelte-1ezajja"&&(y.innerHTML=Te),O=a(e),d(j.$$.fragment,e),D=a(e),U=o(e,"P",{"data-svelte-h":!0}),m(U)!=="svelte-e1i2tj"&&(U.innerHTML=_e),ee=a(e),d(P.$$.fragment,e),te=a(e),z=o(e,"P",{"data-svelte-h":!0}),m(z)!=="svelte-bvqjor"&&(z.innerHTML=Ce),le=a(e),d(G.$$.fragment,e),se=a(e),k=o(e,"P",{"data-svelte-h":!0}),m(k)!=="svelte-19oel81"&&(k.textContent=ye),ne=a(e),d(Z.$$.fragment,e),ae=a(e),J=o(e,"P",{"data-svelte-h":!0}),m(J)!=="svelte-y1nchk"&&(J.innerHTML=je),ie=a(e),L=o(e,"P",{"data-svelte-h":!0}),m(L)!=="svelte-1pogoqw"&&(L.textContent=Ue),re=a(e),d(w.$$.fragment,e),oe=a(e),A=o(e,"TABLE",{"data-svelte-h":!0}),m(A)!=="svelte-4k3weo"&&(A.innerHTML=Pe),pe=a(e),d(Y.$$.fragment,e),me=a(e),S=o(e,"P",{"data-svelte-h":!0}),m(S)!=="svelte-1xakrqo"&&(S.innerHTML=ze),ce=a(e),d(M.$$.fragment,e),de=a(e),d(W.$$.fragment,e),fe=a(e),B=o(e,"P",{"data-svelte-h":!0}),m(B)!=="svelte-1z0alx"&&(B.innerHTML=Ge),he=a(e),d(H.$$.fragment,e),ue=a(e),X=o(e,"P",{"data-svelte-h":!0}),m(X)!=="svelte-1oucnsg"&&(X.innerHTML=ke),$e=a(e),d(E.$$.fragment,e),ge=a(e),V=o(e,"P",{}),Le(V).forEach(l),this.h()},h(){Ae(i,"name","hf:doc:metadata"),Ae(i,"content",qe)},m(e,t){Ie(document.head,i),s(e,g,t),s(e,p,t),s(e,b,t),f(v,e,t),s(e,F,t),f(x,e,t),s(e,N,t),s(e,T,t),s(e,Q,t),s(e,_,t),s(e,q,t),s(e,C,t),s(e,K,t),s(e,y,t),s(e,O,t),f(j,e,t),s(e,D,t),s(e,U,t),s(e,ee,t),f(P,e,t),s(e,te,t),s(e,z,t),s(e,le,t),f(G,e,t),s(e,se,t),s(e,k,t),s(e,ne,t),f(Z,e,t),s(e,ae,t),s(e,J,t),s(e,ie,t),s(e,L,t),s(e,re,t),f(w,e,t),s(e,oe,t),s(e,A,t),s(e,pe,t),f(Y,e,t),s(e,me,t),s(e,S,t),s(e,ce,t),f(M,e,t),s(e,de,t),f(W,e,t),s(e,fe,t),s(e,B,t),s(e,he,t),f(H,e,t),s(e,ue,t),s(e,X,t),s(e,$e,t),f(E,e,t),s(e,ge,t),s(e,V,t),be=!0},p(e,[t]){const Ze={};t&2&&(Ze.$$scope={dirty:t,ctx:e}),w.$set(Ze);const Je={};t&2&&(Je.$$scope={dirty:t,ctx:e}),M.$set(Je)},i(e){be||(h(v.$$.fragment,e),h(x.$$.fragment,e),h(j.$$.fragment,e),h(P.$$.fragment,e),h(G.$$.fragment,e),h(Z.$$.fragment,e),h(w.$$.fragment,e),h(Y.$$.fragment,e),h(M.$$.fragment,e),h(W.$$.fragment,e),h(H.$$.fragment,e),h(E.$$.fragment,e),be=!0)},o(e){u(v.$$.fragment,e),u(x.$$.fragment,e),u(j.$$.fragment,e),u(P.$$.fragment,e),u(G.$$.fragment,e),u(Z.$$.fragment,e),u(w.$$.fragment,e),u(Y.$$.fragment,e),u(M.$$.fragment,e),u(W.$$.fragment,e),u(H.$$.fragment,e),u(E.$$.fragment,e),be=!1},d(e){e&&(l(g),l(p),l(b),l(F),l(N),l(T),l(Q),l(_),l(q),l(C),l(K),l(y),l(O),l(D),l(U),l(ee),l(te),l(z),l(le),l(se),l(k),l(ne),l(ae),l(J),l(ie),l(L),l(re),l(oe),l(A),l(pe),l(me),l(S),l(ce),l(de),l(fe),l(B),l(he),l(ue),l(X),l($e),l(ge),l(V)),l(i),$(v,e),$(x,e),$(j,e),$(P,e),$(G,e),$(Z,e),$(w,e),$(Y,e),$(M,e),$(W,e),$(H,e),$(E,e)}}}const qe='{"title":"Comparing performance across distributed setups","local":"comparing-performance-across-distributed-setups","sections":[{"title":"Setting the Seed","local":"setting-the-seed","sections":[],"depth":2},{"title":"Observed Batch Sizes","local":"observed-batch-sizes","sections":[],"depth":2},{"title":"Learning Rates","local":"learning-rates","sections":[],"depth":2},{"title":"Gradient Accumulation and Mixed Precision","local":"gradient-accumulation-and-mixed-precision","sections":[],"depth":2}],"depth":1}';function Ke(I){return Be(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class st extends He{constructor(i){super(),Xe(this,i,Ke,Qe,We,{})}}export{st as component}; | |
Xet Storage Details
- Size:
- 11.4 kB
- Xet hash:
- 5d32d6f9e6112a0d4927883f252fb097eecbaf42448ab635bccc480f88f9a37b
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.