Buckets:

rtrm's picture
download
raw
14.5 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;About Inference Endpoints&quot;,&quot;local&quot;:&quot;about-inference-endpoints&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Inference Engines&quot;,&quot;local&quot;:&quot;inference-engines&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Under the Hood&quot;,&quot;local&quot;:&quot;under-the-hood&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Enterprise or Team Subscription&quot;,&quot;local&quot;:&quot;enterprise-or-team-subscription&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/inference-endpoints/pr_151/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/entry/start.56631b46.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/scheduler.eb244325.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/singletons.54c25bcd.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/index.3c23fb4b.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/paths.12ce0a18.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/entry/app.08bc0e6a.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/preload-helper.0ac538a1.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/index.661680a1.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/nodes/0.69485259.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/nodes/2.df9b4412.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.c047d438.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;About Inference Endpoints&quot;,&quot;local&quot;:&quot;about-inference-endpoints&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Inference Engines&quot;,&quot;local&quot;:&quot;inference-engines&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Under the Hood&quot;,&quot;local&quot;:&quot;under-the-hood&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Enterprise or Team Subscription&quot;,&quot;local&quot;:&quot;enterprise-or-team-subscription&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" 
width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="about-inference-endpoints" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#about-inference-endpoints"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>About Inference Endpoints</span></h1> <p data-svelte-h="svelte-12egzkp">Inference Endpoints is a managed service to deploy your AI model to production. 
The infrastructure is managed and configured such that
you can focus on building your AI application.</p> <p data-svelte-h="svelte-pjhxe6">To get an AI model into production, you need three key components:</p> <ol data-svelte-h="svelte-cejzw3"><li><p><strong>Model Weights and Artifacts</strong>: These are the trained parameters and files that define your AI model, stored and versioned on the
Hugging Face Hub.</p></li> <li><p><strong>Inference Engine</strong>: This is the software that loads and runs your model to generate predictions. Popular engines include vLLM, TGI, and
others, each optimized for different use cases and performance needs.</p></li> <li><p><strong>Production Infrastructure</strong>: This is what Inference Endpoints is. A scalable, secure, and reliable environment where your model runs—handling
requests, scaling with demand, and ensuring uptime.</p></li></ol> <p data-svelte-h="svelte-1y1l12u">Inference Endpoints brings all these pieces together into a single managed service. You choose your model from the Hub, select the
inference engine, and Inference Endpoints takes care of the rest—provisioning infrastructure, deploying your model, and making it
accessible via a simple API. This lets you focus on building your application, while we handle the complexity of production AI deployment.</p> <p data-svelte-h="svelte-1s5ukr3"><img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/about.png" alt="about"></p> <h2 class="relative group"><a id="inference-engines" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#inference-engines"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Inference Engines</span></h2> <p data-svelte-h="svelte-59uh5h">To achieve that we’ve made Inference Endpoints the central place to deploy high performance and open-source Inference Engines.</p> <p data-svelte-h="svelte-1ygbjmf">Currently we have native support for:</p> <ul data-svelte-h="svelte-1i10e71"><li>vLLM</li> <li>Text-generation-inference (TGI)</li> <li>SGLang</li> <li>llama.cpp</li> <li>and Text-embeddings-inference (TEI)</li></ul> <p data-svelte-h="svelte-y0p6de">For the natively supported engines we try to set sensible defaults, expose the most relevant configuration settings and collaborate closely
with the teams maintaining the Inference Engines to make sure they are optimized for production performance.</p> <p data-svelte-h="svelte-l79z0z">If you don’t find your favourite engine here, please reach out to us at <a href="mailto:api-enterprise@huggingface.co">api-enterprise@huggingface.co</a>.</p> <h2 class="relative group"><a id="under-the-hood" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#under-the-hood"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Under the Hood</span></h2> <p data-svelte-h="svelte-po5x90">When you deploy an Inference Endpoint, under the hood your selected inference engine (like vLLM, TGI, SGLang, etc.) is packaged
and launched as a prebuilt Docker container. This container includes the inference engine software, your chosen model
weights and artifacts (downloaded directly from the Hugging Face Hub), and any configuration or environment variables you specify.</p> <p data-svelte-h="svelte-9jxghf">We manage the full lifecycle of these containers: starting, stopping, scaling (including autoscaling and scale-to-zero),
and monitoring them for health and performance. This orchestration is completely managed for you, so you don’t have to worry about
the complexities of containerization, networking, or cloud resource management.</p> <h2 class="relative group"><a id="enterprise-or-team-subscription" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#enterprise-or-team-subscription"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Enterprise or Team Subscription</span></h2> <p data-svelte-h="svelte-1ne9hjb">For more features consider subscribing to <a href="https://huggingface.co/enterprise" rel="nofollow">Team or Enterprise</a>.</p> <p data-svelte-h="svelte-1uam4qw">It gives your organization more control over access controls, dedicated support and more. 
Features include:</p> <ul data-svelte-h="svelte-1vxs7qh"><li>Higher quotas for the most performant GPUs</li> <li>Single Sign-on (SSO)</li> <li>Access to Audit Logs</li> <li>Manage team and project access controls with Resource Groups</li> <li>Private storage for your repositories</li> <li>Disable the ability to create public repositories (or make repositories private by default)</li> <li>You can request a quote for a contract-based invoice, which allows for more payment options and prepaid credits</li> <li>and more!</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/about.md" target="_blank" rel="noopener noreferrer"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
  // Client bootstrap: publish the path configuration, load the two entry
  // modules, then hand the page over to the runtime's start() function.

  // Assigned without a declaration keyword, so in this classic (non-module)
  // script the object becomes a property of the global object.
  // NOTE(review): presumably read back by the imported entry modules — confirm.
  __sveltekit_10wt9iy = {
    assets: "/docs/inference-endpoints/pr_151/en",
    base: "/docs/inference-endpoints/pr_151/en",
    env: {}
  };

  // Capture the mount target now: document.currentScript is only available
  // while this script is executing synchronously, not inside the callback.
  const mountTarget = document.currentScript.parentElement;

  // One entry per node id passed to start() below; both are null here.
  const pageData = [null, null];

  // Kick off both dynamic imports up front, in the same order as before.
  const startModuleLoading = import("/docs/inference-endpoints/pr_151/en/_app/immutable/entry/start.56631b46.js");
  const appModuleLoading = import("/docs/inference-endpoints/pr_151/en/_app/immutable/entry/app.08bc0e6a.js");

  Promise.all([startModuleLoading, appModuleLoading]).then((loaded) => {
    const [startModule, appModule] = loaded;
    const startOptions = {
      node_ids: [0, 2],
      data: pageData,
      form: null,
      error: null
    };
    startModule.start(appModule, mountTarget, startOptions);
  });
}
</script>

Xet Storage Details

Size:
14.5 kB
·
Xet hash:
03c7223d334a09609bff0702588a02a7a572224380cfcc53973ea9002f722a3c

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.