Buckets:
| import{s as Ge,o as Ye}from"../chunks/scheduler.eb244325.js";import{S as Be,i as Xe,e as r,s as a,c,h as Ze,a as u,d as n,b as s,f as Ne,g as f,j as m,k as Fe,l as Oe,m as i,n as d,t as h,o as g,p as $,r as ve,u as be}from"../chunks/index.661680a1.js";import{T as we}from"../chunks/Tip.76637dd3.js";import{C as Re,H as X,E as We}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.83ba2125.js";function De(y){let l;return{c(){l=ve("In the Analytics section of the guide, you can read more about how to track all the metrics mentioned in this documentation.")},l(o){l=be(o,"In the Analytics section of the guide, you can read more about how to track all the metrics mentioned in this documentation.")},m(o,p){i(o,l,p)},d(o){o&&n(l)}}}function Je(y){let l;return{c(){l=ve(`Note that scaling up can take a few minutes depending on the model, which means that scaling from 0 to 1 based on a request is typically not recommended if your | |
| application needs to be responsive.`)},l(o){l=be(o,`Note that scaling up can take a few minutes depending on the model, which means that scaling from 0 to 1 based on a request is typically not recommended if your | |
| application needs to be responsive.`)},m(o,p){i(o,l,p)},d(o){o&&n(l)}}}function Ke(y){let l;return{c(){l=ve("Note that if scale to zero is enabled, the minimum number of replicas needs to be 0.")},l(o){l=be(o,"Note that if scale to zero is enabled, the minimum number of replicas needs to be 0.")},m(o,p){i(o,l,p)},d(o){o&&n(l)}}}function Qe(y){let l,o,p,O,x,R,_,W,C,xe=`Autoscaling allows you to dynamically adjust the number of endpoint replicas running your models based on traffic and hardware | |
| utilization. By leveraging autoscaling, you can seamlessly handle varying workloads while optimizing costs and ensuring high availability.`,D,T,_e="You can find the autoscaling setting for you endpoints under the “Settings tab” on the Inference Endpoint card.",J,z,Ce='<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/autoscaling/settings.png" alt="settings"/>',K,w,Q,P,V,q,Te=`Scaling to zero means that your Inference Endpoint will go idle after a given duration (1 hour by default) of inactivity. This is typically | |
| very useful when you want to optimize for low costs or when your workloads are intermittent.`,ee,S,ze=`Scaling to zero replicas helps optimize cost savings by minimizing resource usage during periods of inactivity. However, it’s important to | |
| be aware that scaling to zero implies a cold start period when the endpoint receives a new request. Additionally, the proxy will | |
| respond with the status code <code>503</code> while the new replica is initializing. To potentially avoid this, you can also add the | |
| ‘X-Scale-Up-Timeout’ header to your requests. This means that when the endpoint is scaling the proxy will hold the request until a replica | |
| is ready, or timeout after the specified amount of seconds. For example ‘X-Scale-Up-Timeout: 600’ would wait for 600 seconds.`,te,v,ne,L,ie,H,Pe=`With this setting, you can change the maximum and minimum amount of replicas. This means that you control the ceiling and the floor of your costs. | |
| Typically, you’d set the minimum to a value such that at the lowest amount of traffic, you’re still serving your users at an acceptable rate. | |
| And the maximum so that you stay within budget, but so that you can serve your users even at the highest points of traffic.`,ae,b,se,M,le,k,qe="For the autoscaling system to work well there needs to be a signal that tells when to scale up and down. For this we have two strategies.",oe,A,re,E,Se=`The autoscaling process is triggered based on the hardware utilization metrics. The criteria for scaling differ depending on the | |
| type of accelerator being used:`,ue,U,Le="<li><strong>CPU</strong>: A new replica is added when the average CPU utilization of all replicas reaches the threshold value (default 80%).</li> <li><strong>GPU</strong>: A new replica is added when the average GPU utilization of all replicas over a 1-minute window reaches the threshold value (default 80%).</li>",me,j,He=`It’s important to note that the scaling up process takes place every minute and scaling down takes place every 2 minutes. This | |
| frequency ensures a balance between responsiveness and stability of the autoscaling system, with a stabilization of 300 seconds | |
| once scaled down.`,pe,I,Me='You can also track the hardware utilization metrics in the Analytics tab, or read more about it <a href="./analytics#hardwareutilisation">here</a>.',ce,N,fe,F,ke=`In some cases, the hardware utilization is not a ‘fast’ enough metric. The reason is that hardware metrics are always slightly lagging from | |
| the actual requests. A metric that is more of a leading indicator is pending requests.`,de,G,Ae=`<li><strong>Pending requests</strong> are requests that have not yet received an HTTP status, meaning they include in-flight requests and requests currently being processed.</li> <li><strong>By default</strong>, if there are more than 1.5 pending requests per replica in the past 20 seconds, it triggers an autoscaling event and adds a replica to your deployment. | |
| You can adjust this threshold value to meet your specific requirements under Endpoint settings.</li>`,he,Y,Ee='Similarly to the hardware metrics, you can track the pending requests in the Analytics tab, or read more about it <a href="./analytics#pendingrequests">here</a>.',ge,B,$e,Z,ye;return x=new Re({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),_=new X({props:{title:"Autoscaling",local:"autoscaling",headingTag:"h1"}}),w=new we({props:{$$slots:{default:[De]},$$scope:{ctx:y}}}),P=new X({props:{title:"Scale to Zero",local:"scale-to-zero",headingTag:"h2"}}),v=new we({props:{$$slots:{default:[Je]},$$scope:{ctx:y}}}),L=new X({props:{title:"Number of replicas",local:"number-of-replicas",headingTag:"h2"}}),b=new we({props:{$$slots:{default:[Ke]},$$scope:{ctx:y}}}),M=new X({props:{title:"Autoscaling Strategy",local:"autoscaling-strategy",headingTag:"h2"}}),A=new X({props:{title:"Scaling based on hardware utilization",local:"scaling-based-on-hardware-utilization",headingTag:"h3"}}),N=new X({props:{title:"Scaling based on pending requests",local:"scaling-based-on-pending-requests",headingTag:"h3"}}),B=new We({props:{source:"https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/guides/autoscaling.md"}}),{c(){l=r("meta"),o=a(),p=r("p"),O=a(),c(x.$$.fragment),R=a(),c(_.$$.fragment),W=a(),C=r("p"),C.textContent=xe,D=a(),T=r("p"),T.textContent=_e,J=a(),z=r("p"),z.innerHTML=Ce,K=a(),c(w.$$.fragment),Q=a(),c(P.$$.fragment),V=a(),q=r("p"),q.textContent=Te,ee=a(),S=r("p"),S.innerHTML=ze,te=a(),c(v.$$.fragment),ne=a(),c(L.$$.fragment),ie=a(),H=r("p"),H.textContent=Pe,ae=a(),c(b.$$.fragment),se=a(),c(M.$$.fragment),le=a(),k=r("p"),k.textContent=qe,oe=a(),c(A.$$.fragment),re=a(),E=r("p"),E.textContent=Se,ue=a(),U=r("ul"),U.innerHTML=Le,me=a(),j=r("p"),j.textContent=He,pe=a(),I=r("p"),I.innerHTML=Me,ce=a(),c(N.$$.fragment),fe=a(),F=r("p"),F.textContent=ke,de=a(),G=r("ul"),G.innerHTML=Ae,he=a(),Y=r("p"),Y.innerHTML=Ee,ge=a(),c(B.$$.fragment),$e=a(),Z=r("p"),this.h()},l(e){const t=Ze("svelte-u9bgzb",document.head);l=u(t,"META",{name:!0,content:!0}),t.forEach(n),o=s(e),p=u(e,"P",{}),Ne(p).forEach(n),O=s(e),f(x.$$.fragment,e),R=s(e),f(_.$$.fragment,e),W=s(e),C=u(e,"P",{"data-svelte-h":!0}),m(C)!=="svelte-n9mkj5"&&(C.textContent=xe),D=s(e),T=u(e,"P",{"data-svelte-h":!0}),m(T)!=="svelte-ums6bn"&&(T.textContent=_e),J=s(e),z=u(e,"P",{"data-svelte-h":!0}),m(z)!=="svelte-cj1mce"&&(z.innerHTML=Ce),K=s(e),f(w.$$.fragment,e),Q=s(e),f(P.$$.fragment,e),V=s(e),q=u(e,"P",{"data-svelte-h":!0}),m(q)!=="svelte-1nbwpml"&&(q.textContent=Te),ee=s(e),S=u(e,"P",{"data-svelte-h":!0}),m(S)!=="svelte-i6wd12"&&(S.innerHTML=ze),te=s(e),f(v.$$.fragment,e),ne=s(e),f(L.$$.fragment,e),ie=s(e),H=u(e,"P",{"data-svelte-h":!0}),m(H)!=="svelte-1w6gfuq"&&(H.textContent=Pe),ae=s(e),f(b.$$.fragment,e),se=s(e),f(M.$$.fragment,e),le=s(e),k=u(e,"P",{"data-svelte-h":!0}),m(k)!=="svelte-37hx4r"&&(k.textContent=qe),oe=s(e),f(A.$$.fragment,e),re=s(e),E=u(e,"P",{"data-svelte-h":!0}),m(E)!=="svelte-fmk3bj"&&(E.textContent=Se),ue=s(e),U=u(e,"UL",{"data-svelte-h":!0}),m(U)!=="svelte-1ud0nw9"&&(U.innerHTML=Le),me=s(e),j=u(e,"P",{"data-svelte-h":!0}),m(j)!=="svelte-159ttyk"&&(j.textContent=He),pe=s(e),I=u(e,"P",{"data-svelte-h":!0}),m(I)!=="svelte-1hsernu"&&(I.innerHTML=Me),ce=s(e),f(N.$$.fragment,e),fe=s(e),F=u(e,"P",{"data-svelte-h":!0}),m(F)!=="svelte-18cmjd7"&&(F.textContent=ke),de=s(e),G=u(e,"UL",{"data-svelte-h":!0}),m(G)!=="svelte-1iexpdh"&&(G.innerHTML=Ae),he=s(e),Y=u(e,"P",{"data-svelte-h":!0}),m(Y)!=="svelte-fl6ys6"&&(Y.innerHTML=Ee),ge=s(e),f(B.$$.fragment,e),$e=s(e),Z=u(e,"P",{}),Ne(Z).forEach(n),this.h()},h(){Fe(l,"name","hf:doc:metadata"),Fe(l,"content",Ve)},m(e,t){Oe(document.head,l),i(e,o,t),i(e,p,t),i(e,O,t),d(x,e,t),i(e,R,t),d(_,e,t),i(e,W,t),i(e,C,t),i(e,D,t),i(e,T,t),i(e,J,t),i(e,z,t),i(e,K,t),d(w,e,t),i(e,Q,t),d(P,e,t),i(e,V,t),i(e,q,t),i(e,ee,t),i(e,S,t),i(e,te,t),d(v,e,t),i(e,ne,t),d(L,e,t),i(e,ie,t),i(e,H,t),i(e,ae,t),d(b,e,t),i(e,se,t),d(M,e,t),i(e,le,t),i(e,k,t),i(e,oe,t),d(A,e,t),i(e,re,t),i(e,E,t),i(e,ue,t),i(e,U,t),i(e,me,t),i(e,j,t),i(e,pe,t),i(e,I,t),i(e,ce,t),d(N,e,t),i(e,fe,t),i(e,F,t),i(e,de,t),i(e,G,t),i(e,he,t),i(e,Y,t),i(e,ge,t),d(B,e,t),i(e,$e,t),i(e,Z,t),ye=!0},p(e,[t]){const Ue={};t&2&&(Ue.$$scope={dirty:t,ctx:e}),w.$set(Ue);const je={};t&2&&(je.$$scope={dirty:t,ctx:e}),v.$set(je);const Ie={};t&2&&(Ie.$$scope={dirty:t,ctx:e}),b.$set(Ie)},i(e){ye||(h(x.$$.fragment,e),h(_.$$.fragment,e),h(w.$$.fragment,e),h(P.$$.fragment,e),h(v.$$.fragment,e),h(L.$$.fragment,e),h(b.$$.fragment,e),h(M.$$.fragment,e),h(A.$$.fragment,e),h(N.$$.fragment,e),h(B.$$.fragment,e),ye=!0)},o(e){g(x.$$.fragment,e),g(_.$$.fragment,e),g(w.$$.fragment,e),g(P.$$.fragment,e),g(v.$$.fragment,e),g(L.$$.fragment,e),g(b.$$.fragment,e),g(M.$$.fragment,e),g(A.$$.fragment,e),g(N.$$.fragment,e),g(B.$$.fragment,e),ye=!1},d(e){e&&(n(o),n(p),n(O),n(R),n(W),n(C),n(D),n(T),n(J),n(z),n(K),n(Q),n(V),n(q),n(ee),n(S),n(te),n(ne),n(ie),n(H),n(ae),n(se),n(le),n(k),n(oe),n(re),n(E),n(ue),n(U),n(me),n(j),n(pe),n(I),n(ce),n(fe),n(F),n(de),n(G),n(he),n(Y),n(ge),n($e),n(Z)),n(l),$(x,e),$(_,e),$(w,e),$(P,e),$(v,e),$(L,e),$(b,e),$(M,e),$(A,e),$(N,e),$(B,e)}}}const Ve='{"title":"Autoscaling","local":"autoscaling","sections":[{"title":"Scale to Zero","local":"scale-to-zero","sections":[],"depth":2},{"title":"Number of replicas","local":"number-of-replicas","sections":[],"depth":2},{"title":"Autoscaling Strategy","local":"autoscaling-strategy","sections":[{"title":"Scaling based on hardware utilization","local":"scaling-based-on-hardware-utilization","sections":[],"depth":3},{"title":"Scaling based on pending requests","local":"scaling-based-on-pending-requests","sections":[],"depth":3}],"depth":2}],"depth":1}';function et(y){return Ye(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class st extends Be{constructor(l){super(),Xe(this,l,et,Qe,Ge,{})}}export{st as component}; | |
Xet Storage Details
- Size:
- 11.2 kB
- Xet hash:
- e6905e8b31d844abbc9de3fc047f1a9d8916da24ecc7aa07ca0ea1e062e01a25
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.