Buckets:

HuggingFaceDocBuilder's picture
download
raw
11.4 kB
/**
 * Compiler-generated (minified) Svelte page module for the documentation page
 * "Comparing performance across distributed setups". Exports the page class
 * `st` under the name `component`.
 *
 * NOTE(review): the single-letter helpers imported from "../chunks/index…"
 * and "../chunks/scheduler…" (r, o, a, n, m, s, l, c, d, f, h, u, $, …) are
 * defined outside this file; judging by how they are called they look like
 * the usual element-create / claim / insert / detach / component-lifecycle
 * helpers, but confirm against those chunks before relying on that.
 *
 * Contents:
 *
 * - Fe(I): returns a fragment object ({c, l, m, p, d}) for one <p> whose
 *   textContent is the "two GPUs … TPU pod with 8 workers" note. It is passed
 *   as the `default` slot of the first `Ye` (Tip) instance, `w`.
 *
 * - Ne(I): same shape as Fe, for the <p> holding the learning-rate-scheduler
 *   note. It is passed as the `default` slot of the second `Ye` instance, `M`.
 *
 * - Qe(I): the page fragment. Before returning it instantiates the child
 *   components: one `Ve` (given only a `containerStyle` prop), five `R`
 *   headings (title / local / headingTag), three `we` code blocks (a `code`
 *   string that looks like base64 of URI-encoded text, a pre-highlighted HTML
 *   string, and wrap:false), the two `Ye` tips above, and one `Re` that
 *   receives the GitHub source URL of the page. The returned object has:
 *     c()  – creates every element/text node and assigns the static
 *            textContent / innerHTML strings declared at the top of Qe.
 *     l(e) – hydration path: claims the same nodes in the same order from
 *            `e` (and the <meta> from document.head) and only re-assigns
 *            content when the node's "data-svelte-h" value differs from the
 *            expected hash. The order of these calls mirrors c() and m();
 *            do not reorder.
 *     h()  – sets name="hf:doc:metadata" and content=qe on the <meta> node.
 *     m(e,t) – appends the <meta> to document.head, inserts every node and
 *            mounts every child component, then sets the `be` flag.
 *     p(e,[t]) – when bit 2 of the dirty mask is set, forwards a new
 *            `$$scope` to the two Tip instances (`w` and `M`) via $set.
 *     i()/o() – run intro/outro on all child components; `be` guards i()
 *            so the intro pass is not repeated while already "in".
 *     d(e) – detaches the nodes (only when `e` is truthy, except the <meta>
 *            which is always detached) and destroys the child components.
 *
 *   The template literals inside Qe (page prose and highlighted code
 *   samples) span several physical lines; their exact bytes are runtime
 *   content and must not be reformatted.
 *
 * - qe: JSON string with the page title, anchor id and the four depth-2
 *   sections; used only as the <meta> content in Qe's h().
 *
 * - Ke(I): instance function handed to `Xe`. It registers a callback through
 *   `Be` that reads the `fw` query parameter from window.location.search and
 *   discards the value, then returns an empty array.
 *
 * - class st extends He: the constructor calls
 *   Xe(this, i, Ke, Qe, We, {}) with the options object `i` it was given.
 */
import{s as We,o as Be,n as Se}from"../chunks/scheduler.b9285784.js";import{S as He,i as Xe,e as r,s as n,c,h as Ee,a as o,d as l,b as a,f as Le,g as d,j as m,k as Ae,l as Ie,m as s,n as f,t as h,o as u,p as $}from"../chunks/index.26bc89a1.js";import{T as Ye}from"../chunks/Tip.e4eba3d6.js";import{C as Ve,H as R,E as Re}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.7a0ae628.js";import{C as we}from"../chunks/CodeBlock.844ff9c3.js";function Fe(I){let i,g="In this example, there are two GPUs for “Multi-GPU” and a TPU pod with 8 workers";return{c(){i=r("p"),i.textContent=g},l(p){i=o(p,"P",{"data-svelte-h":!0}),m(i)!=="svelte-k9rq2f"&&(i.textContent=g)},m(p,b){s(p,i,b)},p:Se,d(p){p&&l(i)}}}function Ne(I){let i,g=`Since users can have their own learning rate schedulers defined, we leave this up to the user to decide if they wish to scale their
learning rate or not.`;return{c(){i=r("p"),i.textContent=g},l(p){i=o(p,"P",{"data-svelte-h":!0}),m(i)!=="svelte-1birp9c"&&(i.textContent=g)},m(p,b){s(p,i,b)},p:Se,d(p){p&&l(i)}}}function Qe(I){let i,g,p,b,v,F,x,N,T,Me=`Evaluating and comparing the performance from different setups can be quite tricky if you don’t know what to look for.
For example, you cannot run the same script with the same batch size across TPU, multi-GPU, and single-GPU with Accelerate
and expect your results to line up.`,Q,_,ve="But why?",q,C,xe="There are three reasons for this that this tutorial will cover:",K,y,Te="<li><strong>Setting the right seeds</strong></li> <li><strong>Observed Batch Sizes</strong></li> <li><strong>Learning Rates</strong></li>",O,j,D,U,_e='While this issue has not come up as much, make sure to use <a href="/docs/accelerate/pr_4021/en/package_reference/utilities#accelerate.utils.set_seed">utils.set_seed()</a> to fully set the seed in all distributed cases so training will be reproducible:',ee,P,te,z,Ce="Why is this important? Under the hood this will set <strong>5</strong> different seed settings:",le,G,se,k,ye="The random state, numpy’s state, torch, torch’s device state, and if TPUs are available torch_xla’s cuda state.",ne,Z,ae,J,je=`When training with Accelerate, the batch size passed to the dataloader is the <strong>batch size per GPU</strong>. What this entails is
a batch size of 64 on two GPUs is truly a batch size of 128. As a result, when testing on a single GPU this needs to be accounted for,
as well as similarly for TPUs.`,ie,L,Ue="The below table can be used as a quick reference to try out different batch sizes:",re,w,oe,A,Pe="<thead><tr><th>Single GPU Batch Size</th> <th>Multi-GPU Equivalent Batch Size</th> <th>TPU Equivalent Batch Size</th></tr></thead> <tbody><tr><td>256</td> <td>128</td> <td>32</td></tr> <tr><td>128</td> <td>64</td> <td>16</td></tr> <tr><td>64</td> <td>32</td> <td>8</td></tr> <tr><td>32</td> <td>16</td> <td>4</td></tr></tbody>",pe,Y,me,S,ze=`As noted in multiple sources[<a href="https://aws.amazon.com/blogs/machine-learning/scalable-multi-node-deep-learning-training-using-gpus-in-the-aws-cloud/" rel="nofollow">1</a>][<a href="https://docs.nvidia.com/clara/clara-train-sdk/pt/model.html#classification-models-multi-gpu-training" rel="nofollow">2</a>], the learning rate should be scaled <em>linearly</em> based on the number of devices present. The below
snippet shows doing so with Accelerate:`,ce,M,de,W,fe,B,Ge=`You will also find that <code>accelerate</code> will step the learning rate based on the number of processes being trained on. This is because
of the observed batch size noted earlier. So in the case of 2 GPUs, the learning rate will be stepped twice as often as a single GPU
to account for the batch size being twice as large (if no changes to the batch size on the single GPU instance are made).`,he,H,ue,X,ke=`When using gradient accumulation and mixed precision, due to how gradient averaging works (accumulation) and the precision loss (mixed precision),
some degradation in performance is expected. This will be explicitly seen when comparing the batch-wise loss between different compute
setups. However, the overall loss, metric, and general performance at the end of training should be <em>roughly</em> the same.`,$e,E,ge,V,be;return v=new Ve({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),x=new R({props:{title:"Comparing performance across distributed setups",local:"comparing-performance-across-distributed-setups",headingTag:"h1"}}),j=new R({props:{title:"Setting the Seed",local:"setting-the-seed",headingTag:"h2"}}),P=new we({props:{code:"ZnJvbSUyMGFjY2VsZXJhdGUudXRpbHMlMjBpbXBvcnQlMjBzZXRfc2VlZCUwQSUwQXNldF9zZWVkKDQyKQ==",highlighted:`<span class="hljs-keyword">from</span> accelerate.utils <span class="hljs-keyword">import</span> set_seed
set_seed(<span class="hljs-number">42</span>)`,wrap:!1}}),G=new we({props:{code:"JTIwJTIwJTIwJTIwcmFuZG9tLnNlZWQoc2VlZCklMEElMjAlMjAlMjAlMjBucC5yYW5kb20uc2VlZChzZWVkKSUwQSUyMCUyMCUyMCUyMHRvcmNoLm1hbnVhbF9zZWVkKHNlZWQpJTBBJTIwJTIwJTIwJTIwdG9yY2guY3VkYS5tYW51YWxfc2VlZF9hbGwoc2VlZCklMjAlMjMlMjBvciUyMHRvcmNoLnhwdS5tYW51YWxfc2VlZF9hbGwlMkMlMjBldGMlMEElMjAlMjAlMjAlMjAlMjMlMjAlNUUlNUUlMjBzYWZlJTIwdG8lMjBjYWxsJTIwdGhpcyUyMGZ1bmN0aW9uJTIwZXZlbiUyMGlmJTIwY3VkYSUyMGlzJTIwbm90JTIwYXZhaWxhYmxlJTBBJTIwJTIwJTIwJTIwaWYlMjBpc190b3JjaF94bGFfYXZhaWxhYmxlKCklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB4bS5zZXRfcm5nX3N0YXRlKHNlZWQp",highlighted:` random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed) <span class="hljs-comment"># or torch.xpu.manual_seed_all, etc</span>
<span class="hljs-comment"># ^^ safe to call this function even if cuda is not available</span>
<span class="hljs-keyword">if</span> is_torch_xla_available():
xm.set_rng_state(seed)`,wrap:!1}}),Z=new R({props:{title:"Observed Batch Sizes",local:"observed-batch-sizes",headingTag:"h2"}}),w=new Ye({props:{$$slots:{default:[Fe]},$$scope:{ctx:I}}}),Y=new R({props:{title:"Learning Rates",local:"learning-rates",headingTag:"h2"}}),M=new Ye({props:{$$slots:{default:[Ne]},$$scope:{ctx:I}}}),W=new we({props:{code:"bGVhcm5pbmdfcmF0ZSUyMCUzRCUyMDFlLTMlMEFhY2NlbGVyYXRvciUyMCUzRCUyMEFjY2VsZXJhdG9yKCklMEFsZWFybmluZ19yYXRlJTIwKiUzRCUyMGFjY2VsZXJhdG9yLm51bV9wcm9jZXNzZXMlMEElMEFvcHRpbWl6ZXIlMjAlM0QlMjBBZGFtVyhwYXJhbXMlM0Rtb2RlbC5wYXJhbWV0ZXJzKCklMkMlMjBsciUzRGxlYXJuaW5nX3JhdGUp",highlighted:`learning_rate = <span class="hljs-number">1e-3</span>
accelerator = Accelerator()
learning_rate *= accelerator.num_processes
optimizer = AdamW(params=model.parameters(), lr=learning_rate)`,wrap:!1}}),H=new R({props:{title:"Gradient Accumulation and Mixed Precision",local:"gradient-accumulation-and-mixed-precision",headingTag:"h2"}}),E=new Re({props:{source:"https://github.com/huggingface/accelerate/blob/main/docs/source/concept_guides/performance.md"}}),{c(){i=r("meta"),g=n(),p=r("p"),b=n(),c(v.$$.fragment),F=n(),c(x.$$.fragment),N=n(),T=r("p"),T.textContent=Me,Q=n(),_=r("p"),_.textContent=ve,q=n(),C=r("p"),C.textContent=xe,K=n(),y=r("ol"),y.innerHTML=Te,O=n(),c(j.$$.fragment),D=n(),U=r("p"),U.innerHTML=_e,ee=n(),c(P.$$.fragment),te=n(),z=r("p"),z.innerHTML=Ce,le=n(),c(G.$$.fragment),se=n(),k=r("p"),k.textContent=ye,ne=n(),c(Z.$$.fragment),ae=n(),J=r("p"),J.innerHTML=je,ie=n(),L=r("p"),L.textContent=Ue,re=n(),c(w.$$.fragment),oe=n(),A=r("table"),A.innerHTML=Pe,pe=n(),c(Y.$$.fragment),me=n(),S=r("p"),S.innerHTML=ze,ce=n(),c(M.$$.fragment),de=n(),c(W.$$.fragment),fe=n(),B=r("p"),B.innerHTML=Ge,he=n(),c(H.$$.fragment),ue=n(),X=r("p"),X.innerHTML=ke,$e=n(),c(E.$$.fragment),ge=n(),V=r("p"),this.h()},l(e){const 
t=Ee("svelte-u9bgzb",document.head);i=o(t,"META",{name:!0,content:!0}),t.forEach(l),g=a(e),p=o(e,"P",{}),Le(p).forEach(l),b=a(e),d(v.$$.fragment,e),F=a(e),d(x.$$.fragment,e),N=a(e),T=o(e,"P",{"data-svelte-h":!0}),m(T)!=="svelte-1u8i7ng"&&(T.textContent=Me),Q=a(e),_=o(e,"P",{"data-svelte-h":!0}),m(_)!=="svelte-1pzk3n6"&&(_.textContent=ve),q=a(e),C=o(e,"P",{"data-svelte-h":!0}),m(C)!=="svelte-1px1vea"&&(C.textContent=xe),K=a(e),y=o(e,"OL",{"data-svelte-h":!0}),m(y)!=="svelte-1ezajja"&&(y.innerHTML=Te),O=a(e),d(j.$$.fragment,e),D=a(e),U=o(e,"P",{"data-svelte-h":!0}),m(U)!=="svelte-e1i2tj"&&(U.innerHTML=_e),ee=a(e),d(P.$$.fragment,e),te=a(e),z=o(e,"P",{"data-svelte-h":!0}),m(z)!=="svelte-bvqjor"&&(z.innerHTML=Ce),le=a(e),d(G.$$.fragment,e),se=a(e),k=o(e,"P",{"data-svelte-h":!0}),m(k)!=="svelte-19oel81"&&(k.textContent=ye),ne=a(e),d(Z.$$.fragment,e),ae=a(e),J=o(e,"P",{"data-svelte-h":!0}),m(J)!=="svelte-y1nchk"&&(J.innerHTML=je),ie=a(e),L=o(e,"P",{"data-svelte-h":!0}),m(L)!=="svelte-1pogoqw"&&(L.textContent=Ue),re=a(e),d(w.$$.fragment,e),oe=a(e),A=o(e,"TABLE",{"data-svelte-h":!0}),m(A)!=="svelte-4k3weo"&&(A.innerHTML=Pe),pe=a(e),d(Y.$$.fragment,e),me=a(e),S=o(e,"P",{"data-svelte-h":!0}),m(S)!=="svelte-1xakrqo"&&(S.innerHTML=ze),ce=a(e),d(M.$$.fragment,e),de=a(e),d(W.$$.fragment,e),fe=a(e),B=o(e,"P",{"data-svelte-h":!0}),m(B)!=="svelte-1z0alx"&&(B.innerHTML=Ge),he=a(e),d(H.$$.fragment,e),ue=a(e),X=o(e,"P",{"data-svelte-h":!0}),m(X)!=="svelte-1oucnsg"&&(X.innerHTML=ke),$e=a(e),d(E.$$.fragment,e),ge=a(e),V=o(e,"P",{}),Le(V).forEach(l),this.h()},h(){Ae(i,"name","hf:doc:metadata"),Ae(i,"content",qe)},m(e,t){Ie(document.head,i),s(e,g,t),s(e,p,t),s(e,b,t),f(v,e,t),s(e,F,t),f(x,e,t),s(e,N,t),s(e,T,t),s(e,Q,t),s(e,_,t),s(e,q,t),s(e,C,t),s(e,K,t),s(e,y,t),s(e,O,t),f(j,e,t),s(e,D,t),s(e,U,t),s(e,ee,t),f(P,e,t),s(e,te,t),s(e,z,t),s(e,le,t),f(G,e,t),s(e,se,t),s(e,k,t),s(e,ne,t),f(Z,e,t),s(e,ae,t),s(e,J,t),s(e,ie,t),s(e,L,t),s(e,re,t),f(w,e,t),s(e,oe,t),s(e,A,t),s(e,pe,t),f(Y,e,t),s(e
,me,t),s(e,S,t),s(e,ce,t),f(M,e,t),s(e,de,t),f(W,e,t),s(e,fe,t),s(e,B,t),s(e,he,t),f(H,e,t),s(e,ue,t),s(e,X,t),s(e,$e,t),f(E,e,t),s(e,ge,t),s(e,V,t),be=!0},p(e,[t]){const Ze={};t&2&&(Ze.$$scope={dirty:t,ctx:e}),w.$set(Ze);const Je={};t&2&&(Je.$$scope={dirty:t,ctx:e}),M.$set(Je)},i(e){be||(h(v.$$.fragment,e),h(x.$$.fragment,e),h(j.$$.fragment,e),h(P.$$.fragment,e),h(G.$$.fragment,e),h(Z.$$.fragment,e),h(w.$$.fragment,e),h(Y.$$.fragment,e),h(M.$$.fragment,e),h(W.$$.fragment,e),h(H.$$.fragment,e),h(E.$$.fragment,e),be=!0)},o(e){u(v.$$.fragment,e),u(x.$$.fragment,e),u(j.$$.fragment,e),u(P.$$.fragment,e),u(G.$$.fragment,e),u(Z.$$.fragment,e),u(w.$$.fragment,e),u(Y.$$.fragment,e),u(M.$$.fragment,e),u(W.$$.fragment,e),u(H.$$.fragment,e),u(E.$$.fragment,e),be=!1},d(e){e&&(l(g),l(p),l(b),l(F),l(N),l(T),l(Q),l(_),l(q),l(C),l(K),l(y),l(O),l(D),l(U),l(ee),l(te),l(z),l(le),l(se),l(k),l(ne),l(ae),l(J),l(ie),l(L),l(re),l(oe),l(A),l(pe),l(me),l(S),l(ce),l(de),l(fe),l(B),l(he),l(ue),l(X),l($e),l(ge),l(V)),l(i),$(v,e),$(x,e),$(j,e),$(P,e),$(G,e),$(Z,e),$(w,e),$(Y,e),$(M,e),$(W,e),$(H,e),$(E,e)}}}const qe='{"title":"Comparing performance across distributed setups","local":"comparing-performance-across-distributed-setups","sections":[{"title":"Setting the Seed","local":"setting-the-seed","sections":[],"depth":2},{"title":"Observed Batch Sizes","local":"observed-batch-sizes","sections":[],"depth":2},{"title":"Learning Rates","local":"learning-rates","sections":[],"depth":2},{"title":"Gradient Accumulation and Mixed Precision","local":"gradient-accumulation-and-mixed-precision","sections":[],"depth":2}],"depth":1}';function Ke(I){return Be(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class st extends He{constructor(i){super(),Xe(this,i,Ke,Qe,We,{})}}export{st as component};

Xet Storage Details

Size:
11.4 kB
·
Xet hash:
5d32d6f9e6112a0d4927883f252fb097eecbaf42448ab635bccc480f88f9a37b

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.