Buckets:

rtrm's picture
download
raw
15.1 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Inference Endpoints&quot;,&quot;local&quot;:&quot;inference-endpoints&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Why use Inference Endpoints&quot;,&quot;local&quot;:&quot;why-use-inference-endpoints&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Key Features&quot;,&quot;local&quot;:&quot;key-features&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Further Reading&quot;,&quot;local&quot;:&quot;further-reading&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/inference-endpoints/pr_136/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/entry/start.fb9ab4d6.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/scheduler.f6b352c8.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/singletons.ceca4163.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/index.26cf6c5a.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/paths.142cd5df.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/entry/app.6247727a.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/index.b90df637.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/nodes/0.2fcde12d.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/nodes/18.5f39985c.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/getInferenceSnippets.1e3ae0bf.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Inference Endpoints&quot;,&quot;local&quot;:&quot;inference-endpoints&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Why use Inference Endpoints&quot;,&quot;local&quot;:&quot;why-use-inference-endpoints&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Key Features&quot;,&quot;local&quot;:&quot;key-features&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Further Reading&quot;,&quot;local&quot;:&quot;further-reading&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="inference-endpoints" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#inference-endpoints"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Inference Endpoints</span></h1> <div class="flex justify-center" data-svelte-h="svelte-119c1fc"><img class="block dark:hidden" 
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hf-endpoints/inference-endpoint-doc-thumbnail-light.png" alt=""> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hf-endpoints/inference-endpoint-doc-thumbnail-dark.png" alt=""></div> <p data-svelte-h="svelte-1b30m6h">Inference Endpoints is a managed service to deploy your AI model to production.
Here you’ll find quickstarts, guides, tutorials, use cases and a lot more.</p> <div class="grid grid-cols-1 md:grid-cols-2 gap-4" data-svelte-h="svelte-1oiez3r"><a class="!no-underline pb-8 pr-4 block rounded-xl border border-gray-200 dark:border-gray-800 bg-gradient-to-br from-blue-50 to-white dark:from-gray-900 dark:to-gray-800 hover:shadow-xl hover:-translate-y-1 transition-all leading-none flex flex-col h-full" href="./quick_start"><h3 class="font-semibold text-gray-900 dark:text-white mb-1 leading-none pt-4 mt-0 pl-4">🔥 Quickstart</h3> <p class="text-sm text-gray-600 dark:text-gray-400 leading-snug pl-4 flex-grow">Deploy a production ready AI model in minutes.</p></a> <a class="!no-underline pb-8 pr-4 block rounded-xl border border-gray-200 dark:border-gray-800 bg-gradient-to-br from-indigo-50 to-white dark:from-gray-900 dark:to-gray-800 hover:shadow-xl hover:-translate-y-1 transition-all leading-none flex flex-col h-full" href="./about"><h3 class="font-semibold text-gray-900 dark:text-white mb-1 leading-none pt-4 mt-0 pl-4">🔍 How Inference Endpoints Works</h3> <p class="text-sm text-gray-600 dark:text-gray-400 leading-snug pl-4 flex-grow">Understand the main components and benefits of Inference Endpoints.</p></a> <a class="!no-underline pb-8 pr-4 block rounded-xl border border-gray-200 dark:border-gray-800 bg-gradient-to-br from-red-50 to-white dark:from-gray-900 dark:to-gray-800 hover:shadow-xl hover:-translate-y-1 transition-all leading-none flex flex-col h-full" href="./guides/foundations"><h3 class="font-semibold text-gray-900 dark:text-white mb-1 leading-none pt-4 mt-0 pl-4">📖 Guides</h3> <p class="text-sm text-gray-600 dark:text-gray-400 leading-snug pl-4 flex-grow">Explore our guides to learn how to configure or enable specific features on the platform.</p></a> <a class="!no-underline pb-8 pr-4 block rounded-xl border border-gray-200 dark:border-gray-800 bg-gradient-to-br from-green-50 to-white dark:from-gray-900 dark:to-gray-800 hover:shadow-xl 
hover:-translate-y-1 transition-all leading-none flex flex-col h-full" href="./tutorials/chat_bot"><h3 class="font-semibold text-gray-900 dark:text-white mb-1 leading-none pt-4 mt-0 pl-4">🧑‍💻 Tutorials</h3> <p class="text-sm text-gray-600 dark:text-gray-400 leading-snug pl-4 flex-grow">Step-by-step guides on common developer scenarios.</p></a></div> <h2 class="relative group"><a id="why-use-inference-endpoints" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#why-use-inference-endpoints"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Why use Inference Endpoints</span></h2> <p data-svelte-h="svelte-1v6kydr">Inference Endpoints makes deploying AI models to production a smooth experience. Instead of spending weeks configuring infrastructure, managing servers, and debugging deployment issues, you can focus on what matters most: your model and your users.</p> <p data-svelte-h="svelte-dok2a3">Our platform eliminates the complexity of AI infrastructure while providing enterprise-grade features that scale with your business needs. 
Whether you’re a startup launching your first AI product or an enterprise team managing hundreds of models, Inference Endpoints provides the reliability, performance, and cost-efficiency you need.</p> <p data-svelte-h="svelte-53gvdg"><strong>Key benefits include:</strong></p> <ul data-svelte-h="svelte-1d31bvv"><li>⬇️ <strong>Reduce operational overhead</strong>: Eliminate the need for dedicated DevOps teams and infrastructure management, letting you focus on innovation.</li> <li>🚀 <strong>Scale with confidence</strong>: Handle traffic spikes automatically without worrying about capacity planning or performance degradation.</li> <li>⬇️ <strong>Lower total cost of ownership</strong>: Avoid the hidden costs of self-managed infrastructure including maintenance, monitoring, and security compliance.</li> <li>💻 <strong>Future-proof your AI stack</strong>: Stay current with the latest frameworks and optimizations without managing complex upgrades.</li> <li>🔥 <strong>Focus on what matters</strong>: Spend your time improving your models and building great user experiences, not managing servers.</li></ul> <h2 class="relative group"><a id="key-features" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#key-features"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 
28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Key Features</span></h2> <ul data-svelte-h="svelte-1fk0tap"><li>📦 <strong>Fully managed infrastructure</strong>: you don’t need to worry about things like Kubernetes, CUDA versions and configuring VPNs. Inference Endpoints deals with this under the hood so you can focus on deploying your model and serving customers as fast as possible.</li> <li>↕️ <strong>Autoscaling</strong>: as there’s more traffic to your model you’ll need more firepower as well. Your Inference Endpoint scales up as traffic increases and down as it decreases to save you on unnecessary compute cost.</li> <li>👀 <strong>Observability</strong>: understand and debug what’s going on in your model through logs &amp; metrics.</li> <li>🔥 <strong>Integrated support for open-source serving frameworks</strong>: Whether you want to deploy your model with vLLM, TGI or a custom container, we got you!</li> <li>🤗 <strong>Seamless integration with the Hugging Face Hub</strong>: Downloading model weights fast and with the correct security policies is paramount when bringing an AI model to production. 
With Inference Endpoints, it’s easy and safe.</li></ul> <h2 class="relative group"><a id="further-reading" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#further-reading"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Further Reading</span></h2> <p data-svelte-h="svelte-esyxjo">If you’re considering using Inference Endpoints in production, read these two case studies:</p> <ul data-svelte-h="svelte-x18f32"><li><a href="https://huggingface.co/blog/mantis-case-study" rel="nofollow">Why we’re switching to Hugging Face Inference Endpoints, and maybe you should too</a></li> <li><a href="https://huggingface.co/blog/cfm-case-study" rel="nofollow">Investing in Performance: Fine-tune small models with LLM insights - a CFM case study</a></li></ul> <p data-svelte-h="svelte-19cwxs8">You might also find these blogs helpful:</p> <ul data-svelte-h="svelte-10v8d3o"><li><a href="https://huggingface.co/blog/alvarobartt/argilla-suggestions-via-inference-endpoints" rel="nofollow">🤗 LLM suggestions in Argilla with HuggingFace Inference Endpoints</a></li> <li><a href="https://www.philschmid.de/inference-endpoints-iac" 
rel="nofollow">Programmatically manage Inference Endpoints</a></li> <li><a href="https://huggingface.co/blog/multi-lora-serving" rel="nofollow">TGI Multi-LoRA: Deploy Once, Serve 30 models</a></li> <li><a href="https://huggingface.co/blog/llama31#hugging-face-inference-endpoints" rel="nofollow">Llama 3.1 - 405B, 70B &amp; 8B with multilinguality and long context</a></li> <li><a href="https://huggingface.co/blog/run-musicgen-as-an-api" rel="nofollow">Deploy MusicGen in no time with Inference Endpoints</a></li></ul> <p data-svelte-h="svelte-17dwlx5">Or try out the <a href="./quick_start">Quick Start</a>!</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/index.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
  // Page-level configuration: the asset and base URL prefixes plus an empty
  // env map. It is assigned without a declaration, so in this classic
  // (non-module) script it becomes a property of the global object.
  // NOTE(review): presumably the imported bundles look it up by this exact
  // name — confirm against the build output before renaming.
  __sveltekit_1q0n26o = {
    assets: "/docs/inference-endpoints/pr_136/en",
    base: "/docs/inference-endpoints/pr_136/en",
    env: {}
  };

  // Captured synchronously: document.currentScript is only populated while
  // this script is executing, so it cannot be read later in the callback.
  const mountTarget = document.currentScript.parentElement;

  // One entry per id in node_ids below; both are null on this page.
  const nodeData = [null, null];

  // Start both dynamic imports up front so they load in parallel.
  const kitModule = import("/docs/inference-endpoints/pr_136/en/_app/immutable/entry/start.fb9ab4d6.js");
  const appModule = import("/docs/inference-endpoints/pr_136/en/_app/immutable/entry/app.6247727a.js");

  Promise.all([kitModule, appModule]).then(function (loaded) {
    const kit = loaded[0];
    const app = loaded[1];

    // Hand the app module, the element that contains this script, and the
    // initial page state to the start entry point.
    kit.start(app, mountTarget, {
      node_ids: [0, 18],
      data: nodeData,
      form: null,
      error: null
    });
  });
}
</script>

Xet Storage Details

Size:
15.1 kB
·
Xet hash:
25728022f171efa21ff06b201324c4ef986e33cbce5949d0a5dc4f2149311291

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.