Buckets:
/**
 * Compiled page component for the documentation page
 * "Llama-3.1-8b performance on AWS Inferentia2 (Latency & Throughput)".
 *
 * This is build output: the short identifiers are aliases of runtime helpers
 * exported by the sibling chunk files and must not be renamed independently.
 * NOTE(review): the helpers look like Svelte's internal runtime
 * (element/space/claim/insert/detach ...) — confirm against the chunk sources
 * before relying on that reading.
 */
import { s as yt, n as kt, o as wt } from "../chunks/scheduler.a2b4ca8e.js";
import {
  S as Ct,
  i as Ht,
  g as s,
  s as l,
  r as Z,
  A as Mt,
  h as r,
  f as n,
  c as a,
  j as $t,
  u as tt,
  x as o,
  k as _t,
  y as Pt,
  a as i,
  v as et,
  d as nt,
  t as it,
  w as lt,
} from "../chunks/index.d2f673cc.js";
import { H as at } from "../chunks/index.35ef470e.js";

/**
 * Fragment factory for the page.
 *
 * Builds four heading components (`at`) and returns an object exposing the
 * lifecycle methods `c`, `l`, `h`, `m`, `p`, `i`, `o` and `d`, which
 * respectively build the nodes from scratch, rebuild them from an existing
 * parent (`l`), set the metadata attributes, insert everything into a target,
 * update (no-op), run intro / outro on the heading components, and remove
 * the nodes again.
 *
 * @param {*} st - Unused by this fragment; kept because the factory is passed
 *   to `Ht` by the component class below.
 * @returns {object} The fragment object described above.
 */
function St(st) {
  // Static page content. Strings assigned through `innerHTML` contain markup;
  // strings assigned through `textContent` are plain text.
  const rt = "How fast is Llama-3.1-8b on Inferentia2? Let’s figure out!";
  const ot = "For this benchmark we will use the following configurations:";
  const mt = "<thead><tr><th>Model type</th> <th>batch_size</th> <th>sequence_length</th></tr></thead> <tbody><tr><td>Llama3.1 8b BS1</td> <td>1</td> <td>4096</td></tr> <tr><td>Llama3.1 8b BS4</td> <td>4</td> <td>4096</td></tr> <tr><td>Llama3.1 8b BS8</td> <td>8</td> <td>4096</td></tr> <tr><td>Llama3.1 8b BS16</td> <td>16</td> <td>4096</td></tr> <tr><td>Llama3.1 8b BS32</td> <td>32</td> <td>4096</td></tr></tbody>";
  const pt = "<em>Note: all models are compiled to use 4 devices corresponding to 8 cores on the <code>inf2.48xlarge</code> instance.</em>";
  const ut = '<em>Note: please refer to the <a href="https://aws.amazon.com/ec2/instance-types/inf2/" rel="nofollow">inferentia2 product page</a> for details on the available instances.</em>';
  // The line break inside this template literal is part of the text.
  const ft = `The time to first token is the time required to process the input tokens and generate the first output token.
It is a very important metric, as it corresponds to the latency directly perceived by the user when streaming generated tokens.`;
  const ht = "We test the time to first token for increasing context sizes, from a typical Q/A usage, to heavy Retrieval Augmented Generation (RAG) use-cases.";
  const dt = "Time to first token is expressed in <strong>seconds</strong>.";
  const ct = '<img src="https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3.1-8b/ttft.png" alt="Llama3.1 8b inferentia2 TTFT" title="Time to first token"/>';
  const gt = "The inter-token latency corresponds to the average time elapsed between two generated tokens.";
  const vt = "It is expressed in <strong>milliseconds</strong>.";
  const bt = '<img src="https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3.1-8b/latency.png" alt="Llama3.1 8b inferentia2 inter-token latency" title="Inter-token latency"/>';
  // The line break inside this template literal is part of the text.
  const Tt = `Unlike some other benchmarks, we evaluate the throughput using generated tokens only, by dividing their number
by the end-to-end latency.`;
  const xt = "Throughput is expressed in <strong>tokens/second</strong>.";
  const Lt = '<img src="https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3.1-8b/throughput.png" alt="Llama3.1 8b inferentia2 throughput" title="Throughput"/>';

  // Node handles, listed in document order. Single upper-case letters other
  // than L and the content nodes are the separators produced by `l()` / `a()`.
  let m, S, M, z, A, u, I, f, j, h, B, d, E, c, q;
  let R, v, W, b, F, T, G, x, N;
  let U, $, Q, _, D, y, J;
  let K, w, O, C, V, H, X, P;
  // Guards `i()` so the heading intros only run once until `o()` resets it.
  let Y;

  // Heading components: page title plus one per section.
  const p = new at({
    props: {
      title: "Llama-3.1-8b performance on AWS Inferentia2 (Latency & Throughput)",
      local: "llama-31-8b-performance-on-aws-inferentia2-latency--throughput",
      headingTag: "h1",
    },
  });
  const g = new at({
    props: {
      title: "Time to first token",
      local: "time-to-first-token",
      headingTag: "h2",
    },
  });
  const L = new at({
    props: {
      title: "Inter-token Latency",
      local: "inter-token-latency",
      headingTag: "h2",
    },
  });
  // NOTE(review): "Throughput" is an h3 here and is nested under
  // "Inter-token Latency" in the `zt` metadata; that looks like a heading-level
  // slip in the source document — confirm upstream, not changed here.
  const k = new at({
    props: {
      title: "Throughput",
      local: "throughput",
      headingTag: "h3",
    },
  });

  return {
    // Build every node from scratch.
    c() {
      m = s("meta");
      S = l();
      M = s("p");
      z = l();
      Z(p.$$.fragment);
      A = l();
      u = s("p");
      u.textContent = rt;
      I = l();
      f = s("p");
      f.textContent = ot;
      j = l();
      h = s("table");
      h.innerHTML = mt;
      B = l();
      d = s("p");
      d.innerHTML = pt;
      E = l();
      c = s("p");
      c.innerHTML = ut;
      q = l();
      Z(g.$$.fragment);
      R = l();
      v = s("p");
      v.textContent = ft;
      W = l();
      b = s("p");
      b.textContent = ht;
      F = l();
      T = s("p");
      T.innerHTML = dt;
      G = l();
      x = s("p");
      x.innerHTML = ct;
      N = l();
      Z(L.$$.fragment);
      U = l();
      $ = s("p");
      $.textContent = gt;
      Q = l();
      _ = s("p");
      _.innerHTML = vt;
      D = l();
      y = s("p");
      y.innerHTML = bt;
      J = l();
      Z(k.$$.fragment);
      K = l();
      w = s("p");
      w.textContent = Tt;
      O = l();
      C = s("p");
      C.innerHTML = xt;
      V = l();
      H = s("p");
      H.innerHTML = Lt;
      X = l();
      P = s("p");
      this.h();
    },

    // Rebuild the node handles from the children of an existing parent `t`.
    // For each static node, the value returned by `o(node)` is compared with
    // the expected "svelte-…" tag and the content is re-assigned on mismatch.
    l(t) {
      const e = Mt("svelte-u9bgzb", document.head);
      m = r(e, "META", { name: true, content: true });
      e.forEach(n);
      S = a(t);
      M = r(t, "P", {});
      $t(M).forEach(n);
      z = a(t);
      tt(p.$$.fragment, t);
      A = a(t);
      u = r(t, "P", { "data-svelte-h": true });
      if (o(u) !== "svelte-1r6vka8") {
        u.textContent = rt;
      }
      I = a(t);
      f = r(t, "P", { "data-svelte-h": true });
      if (o(f) !== "svelte-18zzjby") {
        f.textContent = ot;
      }
      j = a(t);
      h = r(t, "TABLE", { "data-svelte-h": true });
      if (o(h) !== "svelte-1lbtr7g") {
        h.innerHTML = mt;
      }
      B = a(t);
      d = r(t, "P", { "data-svelte-h": true });
      if (o(d) !== "svelte-tktwzm") {
        d.innerHTML = pt;
      }
      E = a(t);
      c = r(t, "P", { "data-svelte-h": true });
      if (o(c) !== "svelte-1zgafe") {
        c.innerHTML = ut;
      }
      q = a(t);
      tt(g.$$.fragment, t);
      R = a(t);
      v = r(t, "P", { "data-svelte-h": true });
      if (o(v) !== "svelte-16f7nw9") {
        v.textContent = ft;
      }
      W = a(t);
      b = r(t, "P", { "data-svelte-h": true });
      if (o(b) !== "svelte-e39hxg") {
        b.textContent = ht;
      }
      F = a(t);
      T = r(t, "P", { "data-svelte-h": true });
      if (o(T) !== "svelte-1et04dj") {
        T.innerHTML = dt;
      }
      G = a(t);
      x = r(t, "P", { "data-svelte-h": true });
      if (o(x) !== "svelte-a4whjy") {
        x.innerHTML = ct;
      }
      N = a(t);
      tt(L.$$.fragment, t);
      U = a(t);
      $ = r(t, "P", { "data-svelte-h": true });
      if (o($) !== "svelte-13h9pgw") {
        $.textContent = gt;
      }
      Q = a(t);
      _ = r(t, "P", { "data-svelte-h": true });
      if (o(_) !== "svelte-1ahl198") {
        _.innerHTML = vt;
      }
      D = a(t);
      y = r(t, "P", { "data-svelte-h": true });
      if (o(y) !== "svelte-ffe8jz") {
        y.innerHTML = bt;
      }
      J = a(t);
      tt(k.$$.fragment, t);
      K = a(t);
      w = r(t, "P", { "data-svelte-h": true });
      if (o(w) !== "svelte-1a309vq") {
        w.textContent = Tt;
      }
      O = a(t);
      C = r(t, "P", { "data-svelte-h": true });
      if (o(C) !== "svelte-14m9i5e") {
        C.innerHTML = xt;
      }
      V = a(t);
      H = r(t, "P", { "data-svelte-h": true });
      if (o(H) !== "svelte-140gjuh") {
        H.innerHTML = Lt;
      }
      X = a(t);
      P = r(t, "P", {});
      $t(P).forEach(n);
      this.h();
    },

    // Set the attributes of the <meta> node that carries the page metadata.
    h() {
      _t(m, "name", "hf:doc:metadata");
      _t(m, "content", zt);
    },

    // Insert: the <meta> node goes to document.head, everything else is
    // inserted into `t` with `e` as third argument, in document order.
    m(t, e) {
      Pt(document.head, m);
      i(t, S, e);
      i(t, M, e);
      i(t, z, e);
      et(p, t, e);
      i(t, A, e);
      i(t, u, e);
      i(t, I, e);
      i(t, f, e);
      i(t, j, e);
      i(t, h, e);
      i(t, B, e);
      i(t, d, e);
      i(t, E, e);
      i(t, c, e);
      i(t, q, e);
      et(g, t, e);
      i(t, R, e);
      i(t, v, e);
      i(t, W, e);
      i(t, b, e);
      i(t, F, e);
      i(t, T, e);
      i(t, G, e);
      i(t, x, e);
      i(t, N, e);
      et(L, t, e);
      i(t, U, e);
      i(t, $, e);
      i(t, Q, e);
      i(t, _, e);
      i(t, D, e);
      i(t, y, e);
      i(t, J, e);
      et(k, t, e);
      i(t, K, e);
      i(t, w, e);
      i(t, O, e);
      i(t, C, e);
      i(t, V, e);
      i(t, H, e);
      i(t, X, e);
      i(t, P, e);
      Y = true;
    },

    // Nothing on this page depends on changing state, so update is the
    // imported no-op.
    p: kt,

    // Intro: forwarded to the four heading components, at most once per cycle.
    i(t) {
      if (!Y) {
        nt(p.$$.fragment, t);
        nt(g.$$.fragment, t);
        nt(L.$$.fragment, t);
        nt(k.$$.fragment, t);
        Y = true;
      }
    },

    // Outro: forwarded to the four heading components and re-arms `i()`.
    o(t) {
      it(p.$$.fragment, t);
      it(g.$$.fragment, t);
      it(L.$$.fragment, t);
      it(k.$$.fragment, t);
      Y = false;
    },

    // Destroy: body nodes are removed only when `t` is truthy; the <meta>
    // node and the heading components are always cleaned up.
    d(t) {
      if (t) {
        n(S);
        n(M);
        n(z);
        n(A);
        n(u);
        n(I);
        n(f);
        n(j);
        n(h);
        n(B);
        n(d);
        n(E);
        n(c);
        n(q);
        n(R);
        n(v);
        n(W);
        n(b);
        n(F);
        n(T);
        n(G);
        n(x);
        n(N);
        n(U);
        n($);
        n(Q);
        n(_);
        n(D);
        n(y);
        n(J);
        n(K);
        n(w);
        n(O);
        n(C);
        n(V);
        n(H);
        n(X);
        n(P);
      }
      n(m);
      lt(p, t);
      lt(g, t);
      lt(L, t);
      lt(k, t);
    },
  };
}

// Serialized table of contents written to the "hf:doc:metadata" <meta> tag.
const zt = '{"title":"Llama-3.1-8b performance on AWS Inferentia2 (Latency & Throughput)","local":"llama-31-8b-performance-on-aws-inferentia2-latency--throughput","sections":[{"title":"Time to first token","local":"time-to-first-token","sections":[],"depth":2},{"title":"Inter-token Latency","local":"inter-token-latency","sections":[{"title":"Throughput","local":"throughput","sections":[],"depth":3}],"depth":2}],"depth":1}';

/**
 * Instance function handed to `Ht`.
 *
 * Registers a callback through `wt` that reads the "fw" query parameter and
 * discards the value, then returns an empty array.
 *
 * @param {*} st - Unused.
 * @returns {Array} Always an empty array.
 */
function At(st) {
  wt(() => {
    // The value is intentionally unused in the compiled output.
    new URLSearchParams(window.location.search).get("fw");
  });
  return [];
}

/** Page component: wires the instance function and fragment factory via `Ht`. */
class Et extends Ct {
  constructor(m) {
    super();
    Ht(this, m, At, St, yt, {});
  }
}

export { Et as component };
Xet Storage Details
- Size:
- 6.8 kB
- Xet hash:
- cc22115925a91795f12af6e83716813f51bcc36611b72dd03efd20da5ee35afc
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.