Buckets:
// Compiled Svelte page: "Mistral-Small-Instruct performance on AWS Inferentia2".
//
// NOTE(review): the chunk exports are minified single letters. The local alias
// names below are inferred from how each helper is used in this module and
// mirror Svelte's internal runtime naming — confirm against the chunk sources.
import {
  s as safeNotEqual,
  n as noop,
  o as onMount,
} from "../chunks/scheduler.a2b4ca8e.js";
import {
  S as SvelteComponent,
  i as init,
  g as element,
  s as space,
  r as createComponent,
  A as headSelector,
  h as claimElement,
  f as detach,
  c as claimSpace,
  j as children,
  u as claimComponent,
  x as getSvelteDataset,
  k as attr,
  y as appendHydration,
  a as insertHydration,
  v as mountComponent,
  d as transitionIn,
  t as transitionOut,
  w as destroyComponent,
} from "../chunks/index.d2f673cc.js";
import { H as Heading } from "../chunks/index.35ef470e.js";

/** JSON table of contents written to the `hf:doc:metadata` <meta> tag. */
const DOC_METADATA =
  '{"title":"Mistral-Small-Instruct performance on AWS Inferentia2 (Latency & Throughput)","local":"mistral-small-instruct-performance-on-aws-inferentia2-latency--throughput","sections":[{"title":"Time to first token","local":"time-to-first-token","sections":[],"depth":2},{"title":"Inter-token Latency","local":"inter-token-latency","sections":[{"title":"Throughput","local":"throughput","sections":[],"depth":3}],"depth":2}],"depth":1}';

/** An empty `<p>` placeholder; the page starts and ends with one. */
const SPACER = { kind: "spacer", tag: "p" };

/**
 * Describes a `<p>` whose content is assigned through `textContent`.
 * @param {string} hash - value the claimed node's dataset hash is compared with
 * @param {string} content - static text of the paragraph
 */
function textParagraph(hash, content) {
  return { kind: "static", tag: "p", prop: "textContent", hash, content };
}

/**
 * Describes an element whose content is assigned through `innerHTML`.
 * @param {string} hash - value the claimed node's dataset hash is compared with
 * @param {string} content - static markup of the element
 * @param {string} [tag="p"] - lowercase tag name of the element
 */
function htmlBlock(hash, content, tag = "p") {
  return { kind: "static", tag, prop: "innerHTML", hash, content };
}

/**
 * Describes a `Heading` child component.
 * @param {string} headingTag - heading level passed to the component ("h1".."h3")
 * @param {string} title - heading text
 * @param {string} local - anchor id of the heading
 */
function sectionHeading(headingTag, title, local) {
  return { kind: "heading", props: { title, local, headingTag } };
}

/**
 * Page content in document order. Every entry is preceded by one whitespace
 * text node when the fragment is created, claimed, mounted and destroyed.
 */
const LAYOUT = [
  SPACER,
  sectionHeading(
    "h1",
    "Mistral-Small-Instruct performance on AWS Inferentia2 (Latency & Throughput)",
    "mistral-small-instruct-performance-on-aws-inferentia2-latency--throughput",
  ),
  textParagraph("svelte-zsv685", "How fast is Mistral on Inferentia2? Let’s figure out!"),
  textParagraph("svelte-18zzjby", "For this benchmark we will use the following configurations:"),
  htmlBlock(
    "svelte-ybs2kf",
    "<thead><tr><th>Model type</th> <th>batch_size</th> <th>sequence_length</th></tr></thead> <tbody><tr><td>Mistral-Small BS1</td> <td>1</td> <td>4096</td></tr> <tr><td>Mistral-Small BS4</td> <td>4</td> <td>4096</td></tr></tbody>",
    "table",
  ),
  htmlBlock(
    "svelte-13lk687",
    "<em>Note: all models are compiled to use 6 devices corresponding to 12 cores on the <code>inf2.48xlarge</code> instance.</em>",
  ),
  htmlBlock(
    "svelte-1zgafe",
    '<em>Note: please refer to the <a href="https://aws.amazon.com/ec2/instance-types/inf2/" rel="nofollow">inferentia2 product page</a> for details on the available instances.</em>',
  ),
  sectionHeading("h2", "Time to first token", "time-to-first-token"),
  // The two sentences are separated by a single line break, as in the original
  // template literal. NOTE(review): the break was reconstructed after removing
  // table-pipe residue (" | |" / "| ") that had leaked into the string.
  textParagraph(
    "svelte-16f7nw9",
    "The time to first token is the time required to process the input tokens and generate the first output token.\nIt is a very important metric, as it corresponds to the latency directly perceived by the user when streaming generated tokens.",
  ),
  textParagraph(
    "svelte-e39hxg",
    "We test the time to first token for increasing context sizes, from a typical Q/A usage, to heavy Retrieval Augmented Generation (RAG) use-cases.",
  ),
  htmlBlock("svelte-1et04dj", "Time to first token is expressed in <strong>seconds</strong>."),
  htmlBlock(
    "svelte-o63b9c",
    '<img src="https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-mistral-small/ttft.png" alt="Mistral Small inferentia2 TTFT" title="Time to first token"/>',
  ),
  sectionHeading("h2", "Inter-token Latency", "inter-token-latency"),
  textParagraph(
    "svelte-13h9pgw",
    "The inter-token latency corresponds to the average time elapsed between two generated tokens.",
  ),
  htmlBlock("svelte-1ahl198", "It is expressed in <strong>milliseconds</strong>."),
  htmlBlock(
    "svelte-p1pt39",
    '<img src="https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-mistral-small/latency.png" alt="Mistral Small inferentia2 inter-token latency" title="Inter-token latency"/>',
  ),
  sectionHeading("h3", "Throughput", "throughput"),
  // Same reconstruction as above: one line break, pipe residue removed.
  textParagraph(
    "svelte-1a309vq",
    "Unlike some other benchmarks, we evaluate the throughput using generated tokens only, by dividing their number\nby the end-to-end latency.",
  ),
  htmlBlock("svelte-14m9i5e", "Throughput is expressed in <strong>tokens/second</strong>."),
  htmlBlock(
    "svelte-o5uurj",
    '<img src="https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-mistral-small/throughput.png" alt="Mistral Small inferentia2 throughput" title="Throughput"/>',
  ),
  SPACER,
];

/**
 * Builds the fragment object for the page.
 *
 * The returned object exposes the lifecycle hooks the runtime calls:
 * `c` (create), `l` (claim existing nodes), `h` (set attributes), `m` (mount),
 * `p` (update, a no-op here), `i`/`o` (transition in/out) and `d` (destroy).
 *
 * @returns {object} fragment with the hooks listed above
 */
function createFragment() {
  // One mutable record per layout entry. Heading components are constructed
  // eagerly, in document order, before any hook runs.
  const parts = LAYOUT.map((spec) => ({
    spec,
    gap: undefined,
    node: undefined,
    heading: spec.kind === "heading" ? new Heading({ props: { ...spec.props } }) : undefined,
  }));
  const headings = parts.filter((part) => part.heading).map((part) => part.heading);
  let metaTag;
  let isCurrent;

  return {
    c() {
      metaTag = element("meta");
      for (const part of parts) {
        part.gap = space();
        if (part.heading) {
          createComponent(part.heading.$$.fragment);
        } else {
          part.node = element(part.spec.tag);
          if (part.spec.kind === "static") {
            part.node[part.spec.prop] = part.spec.content;
          }
        }
      }
      this.h();
    },

    l(nodes) {
      const headNodes = headSelector("svelte-u9bgzb", document.head);
      metaTag = claimElement(headNodes, "META", { name: true, content: true });
      headNodes.forEach(detach);

      for (const part of parts) {
        const { spec } = part;
        part.gap = claimSpace(nodes);
        if (part.heading) {
          claimComponent(part.heading.$$.fragment, nodes);
        } else if (spec.kind === "spacer") {
          part.node = claimElement(nodes, "P", {});
          children(part.node).forEach(detach);
        } else {
          part.node = claimElement(nodes, spec.tag.toUpperCase(), { "data-svelte-h": true });
          // Only rewrite the content when the claimed node's hash differs.
          if (getSvelteDataset(part.node) !== spec.hash) {
            part.node[spec.prop] = spec.content;
          }
        }
      }
      this.h();
    },

    h() {
      attr(metaTag, "name", "hf:doc:metadata");
      attr(metaTag, "content", DOC_METADATA);
    },

    m(target, anchor) {
      appendHydration(document.head, metaTag);
      for (const part of parts) {
        insertHydration(target, part.gap, anchor);
        if (part.heading) {
          mountComponent(part.heading, target, anchor);
        } else {
          insertHydration(target, part.node, anchor);
        }
      }
      isCurrent = true;
    },

    p: noop,

    i(local) {
      if (isCurrent) {
        return;
      }
      for (const heading of headings) {
        transitionIn(heading.$$.fragment, local);
      }
      isCurrent = true;
    },

    o(local) {
      for (const heading of headings) {
        transitionOut(heading.$$.fragment, local);
      }
      isCurrent = false;
    },

    d(detaching) {
      if (detaching) {
        // DOM nodes only; heading components are destroyed below.
        for (const part of parts) {
          detach(part.gap);
          if (!part.heading) {
            detach(part.node);
          }
        }
      }
      detach(metaTag);
      for (const heading of headings) {
        destroyComponent(heading, detaching);
      }
    },
  };
}

/**
 * Instance function passed to `init`.
 *
 * Registers a mount callback that reads the `fw` query parameter; the value is
 * discarded, exactly as in the compiled original.
 *
 * @returns {Array} empty context array
 */
function instance() {
  onMount(() => {
    new URLSearchParams(window.location.search).get("fw");
  });
  return [];
}

/** Page component; exported under the name `component`. */
class Page extends SvelteComponent {
  constructor(options) {
    super();
    init(this, options, instance, createFragment, safeNotEqual, {});
  }
}

export { Page as component };
Xet Storage Details
- Size:
- 6.67 kB
- Xet hash:
- 4675d74cf62c78eaafd9c6385a527dbe840e90b1948a24c4eed5856b772a7c39
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.