Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev/inference-endpoints/pr_152/en/_app/immutable/nodes/12.01ac3d70.js

rtrm's picture

about 1 month ago

11.2 kB

	/**
	 * Page node for the "Autoscaling" guide of the Inference Endpoints docs.
	 *
	 * NOTE(review): this looks like compiler-generated (Svelte-style) output: every
	 * fragment exposes the `c` / `l` / `m` / `p` / `i` / `o` / `d` methods and all
	 * runtime helpers come from minified chunk exports. The helper semantics are
	 * inferred from how they are called here, not from their definitions; confirm
	 * against the chunk sources before relying on the comments below.
	 *
	 * Fix applied while reformatting: the transition-in guard was written with
	 * backslash-escaped pipes (`ye\|\|(...)`), which is not valid JavaScript and
	 * prevented the module from parsing. It is now a plain guard clause.
	 *
	 * The `\n\t` escapes inside the text constants reproduce the line breaks (and
	 * leading tab) that the original template literals contained.
	 */
	import { s as Ge, o as Ye } from "../chunks/scheduler.eb244325.js";
	import {
		S as Be,
		i as Xe,
		e as r,
		s as a,
		c,
		h as Ze,
		a as u,
		d as n,
		b as s,
		f as Ne,
		g as f,
		j as m,
		k as Fe,
		l as Oe,
		m as i,
		n as d,
		t as h,
		o as g,
		p as $,
		r as ve,
		u as be,
	} from "../chunks/index.661680a1.js";
	import { T as we } from "../chunks/Tip.76637dd3.js";
	import { C as Re, H as X, E as We } from "../chunks/MermaidChart.svelte_svelte_type_style_lang.83ba2125.js";

	/**
	 * Default-slot fragment for the first tip (pointer to the Analytics section).
	 *
	 * @param {*} y - Context passed by the caller; not read by this fragment.
	 * @returns {{c: Function, l: Function, m: Function, d: Function}} Fragment
	 *   object: `c` builds the node with `ve`, `l` builds it with `be` from an
	 *   existing node list, `m` inserts it with `i`, `d` removes it with `n`.
	 */
	function De(y) {
		const tipText =
			"In the Analytics section of the guide, you can read more about how to track all the metrics mentioned in this documentation.";
		let l;
		return {
			c() {
				l = ve(tipText);
			},
			l(o) {
				l = be(o, tipText);
			},
			m(o, p) {
				i(o, l, p);
			},
			d(o) {
				// Only remove the node when the caller asks for detachment.
				if (o) n(l);
			},
		};
	}

	/**
	 * Default-slot fragment for the second tip (scale-up latency warning).
	 *
	 * @param {*} y - Context passed by the caller; not read by this fragment.
	 * @returns {{c: Function, l: Function, m: Function, d: Function}} Fragment
	 *   object with the same create / claim / mount / destroy shape as `De`.
	 */
	function Je(y) {
		const tipText =
			"Note that scaling up can take a few minutes depending on the model, which means that scaling from 0 to 1 based on a request is typically not recommended if your\n\tapplication needs to be responsive.";
		let l;
		return {
			c() {
				l = ve(tipText);
			},
			l(o) {
				l = be(o, tipText);
			},
			m(o, p) {
				i(o, l, p);
			},
			d(o) {
				if (o) n(l);
			},
		};
	}

	/**
	 * Default-slot fragment for the third tip (scale-to-zero replica constraint).
	 *
	 * @param {*} y - Context passed by the caller; not read by this fragment.
	 * @returns {{c: Function, l: Function, m: Function, d: Function}} Fragment
	 *   object with the same create / claim / mount / destroy shape as `De`.
	 */
	function Ke(y) {
		const tipText = "Note that if scale to zero is enabled, the minimum number of replicas needs to be 0.";
		let l;
		return {
			c() {
				l = ve(tipText);
			},
			l(o) {
				l = be(o, tipText);
			},
			m(o, p) {
				i(o, l, p);
			},
			d(o) {
				if (o) n(l);
			},
		};
	}

	/**
	 * Root fragment of the page: a metadata tag appended to `document.head`,
	 * followed by headings, paragraphs, lists and three tips in document order.
	 *
	 * @param {*} y - Component context; forwarded as `$$scope.ctx` to the tips.
	 * @returns {object} Fragment object exposing `c`, `l`, `h`, `m`, `p`, `i`,
	 *   `o` and `d`.
	 */
	function Qe(y) {
		// Static text / markup of the page, in document order.
		const xe =
			"Autoscaling allows you to dynamically adjust the number of endpoint replicas running your models based on traffic and hardware\n\tutilization. By leveraging autoscaling, you can seamlessly handle varying workloads while optimizing costs and ensuring high availability.";
		const _e = "You can find the autoscaling setting for you endpoints under the “Settings tab” on the Inference Endpoint card.";
		const Ce =
			'<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/autoscaling/settings.png" alt="settings"/>';
		const Te =
			"Scaling to zero means that your Inference Endpoint will go idle after a given duration (1 hour by default) of inactivity. This is typically\n\tvery useful when you want to optimize for low costs or when your workloads are intermittent.";
		const ze =
			"Scaling to zero replicas helps optimize cost savings by minimizing resource usage during periods of inactivity. However, it’s important to\n\tbe aware that scaling to zero implies a cold start period when the endpoint receives a new request. Additionally, the proxy will\n\trespond with the status code <code>503</code> while the new replica is initializing. To potentially avoid this, you can also add the\n\t‘X-Scale-Up-Timeout’ header to your requests. This means that when the endpoint is scaling the proxy will hold the request until a replica\n\tis ready, or timeout after the specified amount of seconds. For example ‘X-Scale-Up-Timeout: 600’ would wait for 600 seconds.";
		const Pe =
			"With this setting, you can change the maximum and minimum amount of replicas. This means that you control the ceiling and the floor of your costs.\n\tTypically, you’d set the minimum to a value such that at the lowest amount of traffic, you’re still serving your users at an acceptable rate.\n\tAnd the maximum so that you stay within budget, but so that you can serve your users even at the highest points of traffic.";
		const qe =
			"For the autoscaling system to work well there needs to be a signal that tells when to scale up and down. For this we have two strategies.";
		const Se =
			"The autoscaling process is triggered based on the hardware utilization metrics. The criteria for scaling differ depending on the\n\ttype of accelerator being used:";
		const Le =
			"<li><strong>CPU</strong>: A new replica is added when the average CPU utilization of all replicas reaches the threshold value (default 80%).</li> <li><strong>GPU</strong>: A new replica is added when the average GPU utilization of all replicas over a 1-minute window reaches the threshold value (default 80%).</li>";
		const He =
			"It’s important to note that the scaling up process takes place every minute and scaling down takes place every 2 minutes. This\n\tfrequency ensures a balance between responsiveness and stability of the autoscaling system, with a stabilization of 300 seconds\n\tonce scaled down.";
		const Me =
			'You can also track the hardware utilization metrics in the Analytics tab, or read more about it <a href="./analytics#hardwareutilisation">here</a>.';
		const ke =
			"In some cases, the hardware utilization is not a ‘fast’ enough metric. The reason is that hardware metrics are always slightly lagging from\n\tthe actual requests. A metric that is more of a leading indicator is pending requests.";
		const Ae =
			"<li><strong>Pending requests</strong> are requests that have not yet received an HTTP status, meaning they include in-flight requests and requests currently being processed.</li> <li><strong>By default</strong>, if there are more than 1.5 pending requests per replica in the past 20 seconds, it triggers an autoscaling event and adds a replica to your deployment.\n\tYou can adjust this threshold value to meet your specific requirements under Endpoint settings.</li>";
		const Ee =
			'Similarly to the hardware metrics, you can track the pending requests in the Analytics tab, or read more about it <a href="./analytics#pendingrequests">here</a>.';

		// Child components, constructed in document order.
		const x = new Re({
			props: {
				containerStyle: "float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;",
			},
		});
		const _ = new X({ props: { title: "Autoscaling", local: "autoscaling", headingTag: "h1" } });
		const w = new we({ props: { $$slots: { default: [De] }, $$scope: { ctx: y } } });
		const P = new X({ props: { title: "Scale to Zero", local: "scale-to-zero", headingTag: "h2" } });
		const v = new we({ props: { $$slots: { default: [Je] }, $$scope: { ctx: y } } });
		const L = new X({ props: { title: "Number of replicas", local: "number-of-replicas", headingTag: "h2" } });
		const b = new we({ props: { $$slots: { default: [Ke] }, $$scope: { ctx: y } } });
		const M = new X({ props: { title: "Autoscaling Strategy", local: "autoscaling-strategy", headingTag: "h2" } });
		const A = new X({
			props: {
				title: "Scaling based on hardware utilization",
				local: "scaling-based-on-hardware-utilization",
				headingTag: "h3",
			},
		});
		const N = new X({
			props: {
				title: "Scaling based on pending requests",
				local: "scaling-based-on-pending-requests",
				headingTag: "h3",
			},
		});
		const B = new We({
			props: {
				source: "https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/guides/autoscaling.md",
			},
		});

		// Every child component in document order; drives transition in/out and destroy.
		const components = [x, _, w, P, v, L, b, M, A, N, B];

		// Nodes owned by this fragment, assigned by `c()` or `l()`.
		let l, o, p, O, R, W, C, D, T, J, z, K, Q, V, q, ee, S, te, ne, ie, H, ae;
		let se, le, k, oe, re, E, ue, U, me, j, pe, I, ce, fe, F, de, G, he, Y, ge, $e, Z;

		// True once the children have been transitioned in; reset by `o()`.
		let ye;

		return {
			/** Builds every node and child fragment from scratch, then applies `h()`. */
			c() {
				l = r("meta");
				o = a();
				p = r("p");
				O = a();
				c(x.$$.fragment);
				R = a();
				c(_.$$.fragment);
				W = a();
				C = r("p");
				C.textContent = xe;
				D = a();
				T = r("p");
				T.textContent = _e;
				J = a();
				z = r("p");
				z.innerHTML = Ce;
				K = a();
				c(w.$$.fragment);
				Q = a();
				c(P.$$.fragment);
				V = a();
				q = r("p");
				q.textContent = Te;
				ee = a();
				S = r("p");
				S.innerHTML = ze;
				te = a();
				c(v.$$.fragment);
				ne = a();
				c(L.$$.fragment);
				ie = a();
				H = r("p");
				H.textContent = Pe;
				ae = a();
				c(b.$$.fragment);
				se = a();
				c(M.$$.fragment);
				le = a();
				k = r("p");
				k.textContent = qe;
				oe = a();
				c(A.$$.fragment);
				re = a();
				E = r("p");
				E.textContent = Se;
				ue = a();
				U = r("ul");
				U.innerHTML = Le;
				me = a();
				j = r("p");
				j.textContent = He;
				pe = a();
				I = r("p");
				I.innerHTML = Me;
				ce = a();
				c(N.$$.fragment);
				fe = a();
				F = r("p");
				F.textContent = ke;
				de = a();
				G = r("ul");
				G.innerHTML = Ae;
				he = a();
				Y = r("p");
				Y.innerHTML = Ee;
				ge = a();
				c(B.$$.fragment);
				$e = a();
				Z = r("p");
				this.h();
			},

			/**
			 * Builds the same structure from existing nodes in `e`. For each static
			 * element the value returned by `m(node)` is compared with a fixed hash
			 * string, and the content is rewritten only when they differ.
			 *
			 * @param {*} e - Node list to claim from (presumably pre-rendered markup; confirm).
			 */
			l(e) {
				const t = Ze("svelte-u9bgzb", document.head);
				l = u(t, "META", { name: true, content: true });
				t.forEach(n);
				o = s(e);
				p = u(e, "P", {});
				Ne(p).forEach(n);
				O = s(e);
				f(x.$$.fragment, e);
				R = s(e);
				f(_.$$.fragment, e);
				W = s(e);
				C = u(e, "P", { "data-svelte-h": true });
				if (m(C) !== "svelte-n9mkj5") C.textContent = xe;
				D = s(e);
				T = u(e, "P", { "data-svelte-h": true });
				if (m(T) !== "svelte-ums6bn") T.textContent = _e;
				J = s(e);
				z = u(e, "P", { "data-svelte-h": true });
				if (m(z) !== "svelte-cj1mce") z.innerHTML = Ce;
				K = s(e);
				f(w.$$.fragment, e);
				Q = s(e);
				f(P.$$.fragment, e);
				V = s(e);
				q = u(e, "P", { "data-svelte-h": true });
				if (m(q) !== "svelte-1nbwpml") q.textContent = Te;
				ee = s(e);
				S = u(e, "P", { "data-svelte-h": true });
				if (m(S) !== "svelte-i6wd12") S.innerHTML = ze;
				te = s(e);
				f(v.$$.fragment, e);
				ne = s(e);
				f(L.$$.fragment, e);
				ie = s(e);
				H = u(e, "P", { "data-svelte-h": true });
				if (m(H) !== "svelte-1w6gfuq") H.textContent = Pe;
				ae = s(e);
				f(b.$$.fragment, e);
				se = s(e);
				f(M.$$.fragment, e);
				le = s(e);
				k = u(e, "P", { "data-svelte-h": true });
				if (m(k) !== "svelte-37hx4r") k.textContent = qe;
				oe = s(e);
				f(A.$$.fragment, e);
				re = s(e);
				E = u(e, "P", { "data-svelte-h": true });
				if (m(E) !== "svelte-fmk3bj") E.textContent = Se;
				ue = s(e);
				U = u(e, "UL", { "data-svelte-h": true });
				if (m(U) !== "svelte-1ud0nw9") U.innerHTML = Le;
				me = s(e);
				j = u(e, "P", { "data-svelte-h": true });
				if (m(j) !== "svelte-159ttyk") j.textContent = He;
				pe = s(e);
				I = u(e, "P", { "data-svelte-h": true });
				if (m(I) !== "svelte-1hsernu") I.innerHTML = Me;
				ce = s(e);
				f(N.$$.fragment, e);
				fe = s(e);
				F = u(e, "P", { "data-svelte-h": true });
				if (m(F) !== "svelte-18cmjd7") F.textContent = ke;
				de = s(e);
				G = u(e, "UL", { "data-svelte-h": true });
				if (m(G) !== "svelte-1iexpdh") G.innerHTML = Ae;
				he = s(e);
				Y = u(e, "P", { "data-svelte-h": true });
				if (m(Y) !== "svelte-fl6ys6") Y.innerHTML = Ee;
				ge = s(e);
				f(B.$$.fragment, e);
				$e = s(e);
				Z = u(e, "P", {});
				Ne(Z).forEach(n);
				this.h();
			},

			/** Sets the metadata tag's `name` and `content` (the `Ve` JSON outline). */
			h() {
				Fe(l, "name", "hf:doc:metadata");
				Fe(l, "content", Ve);
			},

			/**
			 * Appends the metadata tag to `document.head` and inserts everything else
			 * into `e` before anchor `t`, in document order.
			 */
			m(e, t) {
				Oe(document.head, l);
				i(e, o, t);
				i(e, p, t);
				i(e, O, t);
				d(x, e, t);
				i(e, R, t);
				d(_, e, t);
				i(e, W, t);
				i(e, C, t);
				i(e, D, t);
				i(e, T, t);
				i(e, J, t);
				i(e, z, t);
				i(e, K, t);
				d(w, e, t);
				i(e, Q, t);
				d(P, e, t);
				i(e, V, t);
				i(e, q, t);
				i(e, ee, t);
				i(e, S, t);
				i(e, te, t);
				d(v, e, t);
				i(e, ne, t);
				d(L, e, t);
				i(e, ie, t);
				i(e, H, t);
				i(e, ae, t);
				d(b, e, t);
				i(e, se, t);
				d(M, e, t);
				i(e, le, t);
				i(e, k, t);
				i(e, oe, t);
				d(A, e, t);
				i(e, re, t);
				i(e, E, t);
				i(e, ue, t);
				i(e, U, t);
				i(e, me, t);
				i(e, j, t);
				i(e, pe, t);
				i(e, I, t);
				i(e, ce, t);
				d(N, e, t);
				i(e, fe, t);
				i(e, F, t);
				i(e, de, t);
				i(e, G, t);
				i(e, he, t);
				i(e, Y, t);
				i(e, ge, t);
				d(B, e, t);
				i(e, $e, t);
				i(e, Z, t);
				ye = true;
			},

			/**
			 * Pushes updates to the three tips. A `$$scope` change is forwarded only
			 * when bit value 2 is set in the dirty mask `t`; `$set` is always called.
			 */
			p(e, [t]) {
				for (const tip of [w, v, b]) {
					const changes = {};
					if (t & 2) changes.$$scope = { dirty: t, ctx: e };
					tip.$set(changes);
				}
			},

			/** Transitions every child component in, at most once until `o()` runs. */
			i(e) {
				// Was `ye\|\|(...)` in the source, which does not parse.
				if (ye) return;
				for (const child of components) h(child.$$.fragment, e);
				ye = true;
			},

			/** Transitions every child component out and re-arms `i()`. */
			o(e) {
				for (const child of components) g(child.$$.fragment, e);
				ye = false;
			},

			/**
			 * Tears the fragment down. Body nodes are removed only when `e` is truthy;
			 * the metadata tag and the child components are always released.
			 */
			d(e) {
				if (e) {
					const ownedNodes = [
						o, p, O, R, W, C, D, T, J, z, K, Q, V, q, ee, S, te, ne, ie, H, ae, se,
						le, k, oe, re, E, ue, U, me, j, pe, I, ce, fe, F, de, G, he, Y, ge, $e, Z,
					];
					for (const node of ownedNodes) n(node);
				}
				n(l);
				for (const child of components) $(child, e);
			},
		};
	}

	// Serialized page outline, written to the metadata tag's `content` by `h()`.
	const Ve =
		'{"title":"Autoscaling","local":"autoscaling","sections":[{"title":"Scale to Zero","local":"scale-to-zero","sections":[],"depth":2},{"title":"Number of replicas","local":"number-of-replicas","sections":[],"depth":2},{"title":"Autoscaling Strategy","local":"autoscaling-strategy","sections":[{"title":"Scaling based on hardware utilization","local":"scaling-based-on-hardware-utilization","sections":[],"depth":3},{"title":"Scaling based on pending requests","local":"scaling-based-on-pending-requests","sections":[],"depth":3}],"depth":2}],"depth":1}';

	/**
	 * Instance setup passed to `Xe`: registers a callback through `Ye` and
	 * exposes no context values.
	 *
	 * @param {*} y - Unused.
	 * @returns {Array} Always an empty array.
	 */
	function et(y) {
		Ye(() => {
			// The `fw` query parameter is read and the value discarded, exactly as in
			// the original output. NOTE(review): looks like a leftover; confirm.
			new URLSearchParams(window.location.search).get("fw");
		});
		return [];
	}

	/** Page component: wires `et` (instance) and `Qe` (fragment) together via `Xe`. */
	class st extends Be {
		/** @param {*} l - Options object, forwarded unchanged to `Xe`. */
		constructor(l) {
			super();
			Xe(this, l, et, Qe, Ge, {});
		}
	}

	export { st as component };

Xet Storage Details

Size: 11.2 kB
Xet hash: e6905e8b31d844abbc9de3fc047f1a9d8916da24ecc7aa07ca0ea1e062e01a25

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.