Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / inference-endpoints /pr_89 /en /autoscaling.html

rtrm

about 2 months ago

download

raw

11.3 kB

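	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Autoscaling&quot;,&quot;local&quot;:&quot;autoscaling&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Scaling Criteria&quot;,&quot;local&quot;:&quot;scaling-criteria&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Considerations for Effective Autoscaling&quot;,&quot;local&quot;:&quot;considerations-for-effective-autoscaling&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Scaling to 0&quot;,&quot;local&quot;:&quot;scaling-to-0&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">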
	<link href="/docs/inference-endpoints/pr_89/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_89/en/_app/immutable/entry/start.c3c0728f.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_89/en/_app/immutable/chunks/scheduler.389d799c.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_89/en/_app/immutable/chunks/singletons.e1a4dd0d.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_89/en/_app/immutable/chunks/paths.cdf3f928.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_89/en/_app/immutable/entry/app.d2265bc6.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_89/en/_app/immutable/chunks/index.8f81d18f.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_89/en/_app/immutable/nodes/0.22e9b8dd.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_89/en/_app/immutable/nodes/3.da142465.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_89/en/_app/immutable/chunks/EditOnGithub.33306dfe.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Autoscaling&quot;,&quot;local&quot;:&quot;autoscaling&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Scaling Criteria&quot;,&quot;local&quot;:&quot;scaling-criteria&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Considerations for Effective Autoscaling&quot;,&quot;local&quot;:&quot;considerations-for-effective-autoscaling&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Scaling to 0&quot;,&quot;local&quot;:&quot;scaling-to-0&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="autoscaling" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#autoscaling"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Autoscaling</span></h1> <p data-svelte-h="svelte-1cik5b8">Autoscaling allows you to dynamically adjust the number of endpoint replicas running your models based on traffic and accelerator utilization. 
By leveraging autoscaling, you can seamlessly handle varying workloads while optimizing costs and ensuring high availability.</p> <h2 class="relative group"><a id="scaling-criteria" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#scaling-criteria"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Scaling Criteria</span></h2> <p data-svelte-h="svelte-15ptey4">The autoscaling process is triggered based on the accelerator’s utilization metrics. The criteria for scaling differ depending on the type of accelerator being used:</p> <ul data-svelte-h="svelte-aptcki"><li><p><strong>CPU Accelerators</strong>: A new replica is added when the average CPU utilization of all replicas reaches 80%.</p></li> <li><p><strong>GPU Accelerators</strong>: A new replica is added when the average GPU utilization of all replicas over a 2-minute window reaches 80%.</p></li></ul> <p data-svelte-h="svelte-1didbzs">It’s important to note that the scaling up process takes place every minute, while the scaling down process takes 2 minutes. 
This frequency ensures a balance between responsiveness and stability of the autoscaling system, with a stabilization of 300 seconds once scaled up or down.</p> <h2 class="relative group"><a id="considerations-for-effective-autoscaling" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#considerations-for-effective-autoscaling"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Considerations for Effective Autoscaling</span></h2> <p data-svelte-h="svelte-m86usn">While autoscaling offers convenient resource management, certain considerations should be kept in mind to ensure its effectiveness:</p> <ul data-svelte-h="svelte-mm7v54"><li><p><strong>Model Initialization Time</strong>: During the initialization of a new replica, the model is downloaded and loaded into memory. If your replicas have a long initialization time, autoscaling may not be as effective. 
This is because the average GPU utilization might fall below the threshold during that time, triggering the automatic scaling down of your endpoint.</p></li> <li><p><strong>Enterprise Plan Control</strong>: If you have an <a href="https://huggingface.co/inference-endpoints/enterprise" rel="nofollow">enterprise plan</a>, you have full control over the autoscaling definitions. This allows you to customize the scaling thresholds, behavior and criteria based on your specific requirements.</p></li></ul> <h2 class="relative group"><a id="scaling-to-0" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#scaling-to-0"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Scaling to 0</span></h2> <p data-svelte-h="svelte-15noffw">Inference Endpoints also supports autoscaling to 0, which means reducing the number of replicas to 0 when there is no incoming traffic. This feature is based on request patterns rather than accelerator utilization. When an endpoint remains idle without receiving any requests for over 15 minutes, the system automatically scales down the endpoint to 0 replicas. 
To enable the feature, go to the Settings page and you’ll find a section called “Automatic Scale-to-Zero”.</p> <p data-svelte-h="svelte-rmhk41">Scaling to 0 replicas helps optimize cost savings by minimizing resource usage during periods of inactivity. However, it’s important to be aware that scaling to 0 implies a cold start period when the endpoint receives a new request. Additionally, the HTTP server will respond with a status code <code>502 Bad Gateway</code> while the new replica is initializing. Please note that there is currently no queueing system in place for incoming requests. Therefore, we recommend developing your own request queue client-side with proper error handling to optimize throughput and latency.</p> <p data-svelte-h="svelte-11efh16">The duration of the cold start period varies depending on your model’s size. It is recommended to consider the potential latency impact when enabling scaling to 0 and managing user expectations.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/autoscaling.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Assigned without a declaration on purpose: in this classic (non-module)
		// script that makes it a property of the global object.
		// NOTE(review): presumably read by the imported entry chunks — confirm.
		__sveltekit_1egnsmt = {
			assets: "/docs/inference-endpoints/pr_89/en",
			base: "/docs/inference-endpoints/pr_89/en",
			env: {}
		};

		// document.currentScript is only set while this script is executing, so
		// the parent element is captured here, before any asynchronous work.
		const mountTarget = document.currentScript.parentElement;

		// One entry per id in node_ids below; both are null on this page.
		const nodeData = [null, null];

		// Start fetching both entry modules at once.
		const entryModules = [
			import("/docs/inference-endpoints/pr_89/en/_app/immutable/entry/start.c3c0728f.js"),
			import("/docs/inference-endpoints/pr_89/en/_app/immutable/entry/app.d2265bc6.js")
		];

		Promise.all(entryModules).then((loaded) => {
			const kit = loaded[0];
			const app = loaded[1];

			// Hand the app module, the mount element and the initial state to
			// the start module's start() function.
			const startOptions = {
				node_ids: [0, 3],
				data: nodeData,
				form: null,
				error: null
			};
			kit.start(app, mountTarget, startOptions);
		});
	}
	</script>

Xet Storage Details

Size: 11.3 kB
Xet hash: 03391a050e58d864da0fd3afdd12fbce6a05b27cdb28fd7b8a27b6af8e0160a4

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.