Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / inference-endpoints /pr_136 /en /engines /tgi.html

rtrm

about 2 months ago

download

raw

15.4 kB

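	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Text Generation Inference (TGI)&quot;,&quot;local&quot;:&quot;text-generation-inference-tgi&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Configuration&quot;,&quot;local&quot;:&quot;configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Zero configuration&quot;,&quot;local&quot;:&quot;zero-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Supported models&quot;,&quot;local&quot;:&quot;supported-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;References&quot;,&quot;local&quot;:&quot;references&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">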
	<link href="/docs/inference-endpoints/pr_136/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/entry/start.fb9ab4d6.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/scheduler.f6b352c8.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/singletons.ceca4163.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/index.26cf6c5a.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/paths.142cd5df.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/entry/app.6247727a.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/index.b90df637.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/nodes/0.2fcde12d.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/nodes/8.d50d836c.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/Tip.366d2e6e.js">
	<link rel="modulepreload" href="/docs/inference-endpoints/pr_136/en/_app/immutable/chunks/getInferenceSnippets.1e3ae0bf.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Text Generation Inference (TGI)&quot;,&quot;local&quot;:&quot;text-generation-inference-tgi&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Configuration&quot;,&quot;local&quot;:&quot;configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Zero configuration&quot;,&quot;local&quot;:&quot;zero-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Supported models&quot;,&quot;local&quot;:&quot;supported-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;References&quot;,&quot;local&quot;:&quot;references&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="text-generation-inference-tgi" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#text-generation-inference-tgi"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Text Generation Inference (TGI)</span></h1> <p data-svelte-h="svelte-1j6m1nq">TGI is a production-grade inference engine built in Rust and Python, designed for high-performance
	serving of open-source LLMs (e.g. LLaMA, Falcon, StarCoder, BLOOM and many more).
	The core features that make TGI a good choice are:</p> <ul data-svelte-h="svelte-lswtjz"><li><strong>Continuous batching + streaming</strong>: Dynamically groups in-flight requests and streams tokens via Server-Sent Events (SSE)</li> <li><strong>Optimized attention &amp; decoding</strong>: TGI uses Flash Attention, Paged Attention, KV-caching, and custom CUDA kernels for latency and memory efficiency</li> <li><strong>Quantization &amp; weight loading speed</strong>: Supports quantization methods like bitsandbytes and GPTQ and uses Safetensors to reduce load times</li> <li><strong>Production readiness</strong>: Fully OpenAI-compatible <code>/v1/chat</code> or <code>/v1/completions</code> APIs, Prometheus metrics, OpenTelemetry tracing, watermarking, logit controls, JSON schema guidance</li></ul> <p data-svelte-h="svelte-159wehp">By default, the TGI version will be the latest available one (with some delay). But you can also specify a different version by <a href="https://raw.githubusercontent.com/not-here" rel="nofollow">changing
	the container URL</a></p> <h2 class="relative group"><a id="configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Configuration</span></h2> <p data-svelte-h="svelte-8ibp2c">When selecting a model to deploy, the Inference Endpoints UI automatically checks whether a model is supported by TGI. If it is, you’ll see
	the option presented under <code>Container Configuration</code> where you can change the following settings:</p> <p data-svelte-h="svelte-12tkryh"><img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/tgi/tgi_config.png" alt="TGI settings under Container Configuration in the Inference Endpoints UI"></p> <ul data-svelte-h="svelte-1vkh47x"><li><strong>Quantization</strong>: Which quantization method, if any, to use for the model.</li> <li><strong>Max Number of Tokens (per query)</strong>: Changes the maximum number of tokens a request can contain.
	For example, a value of <code>1512</code> means users can send either a prompt of <code>1000</code> tokens and generate <code>512</code> new tokens,
	or send a prompt of <code>1</code> token and generate <code>1511</code> new tokens. The larger this value, the more RAM each request
	takes up and the less effective batching can be.</li> <li><strong>Max Input Tokens (per query)</strong>: The maximum number of input tokens, meaning the number of tokens in the prompt.</li> <li><strong>Max Batch Prefill Tokens</strong>: Limits the number of tokens for the prefill operation. Prefill tokens are the ones sent in with the user prompt.</li> <li><strong>Max Batch Total Tokens</strong>: This changes the total number of potential tokens within a batch. Together with <code>Max Number of Tokens</code>,
	this determines how many concurrent requests you can serve. If you set <code>Max Number of Tokens</code> to 100 and <code>Max Batch Total Tokens</code> to 100 as well,
	you can only serve one request at a time.</li></ul> <p data-svelte-h="svelte-1b37qua">In general, zero-configuration (see below) is recommended for most cases. TGI supports several other configuration parameters and you’ll find a complete list
	in the <a href="https://huggingface.co/docs/text-generation-inference/reference/launcher#text-generation-launcher-arguments" rel="nofollow">TGI documentation</a>. These can all be
	set by passing the values as environment variables to the container, <a href="https://huggingface.co/no-link-yet" rel="nofollow">link to guide</a>.</p> <h2 class="relative group"><a id="zero-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#zero-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Zero configuration</span></h2> <p data-svelte-h="svelte-1d6c15z">Introduced in TGI v3, the zero-config mode helps you get the most out of your hardware without manual configuration and trial & error.
	If you leave the values undefined, TGI will, on server startup, automatically select the maximal possible values (based on the hardware it’s running on)
	for the max input length, max number of tokens, max batch prefill tokens and max batch total tokens. This means that you’ll use your hardware to its full capacity.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">Note that there's a caveat: say you're deploying <code>meta-llama/Llama-3.3-70B-Instruct</code>, which has a context length of 128k tokens.
	But you're on a GPU where you can only fit the model's context three times in memory. So if you want to serve the model with full context length,
	you can only serve up to 3 concurrent requests. In some cases, it's fine to drop the maximum context length to 64k tokens, which would
	allow the server to process 6 concurrent requests.
	You can configure this by setting max input length to 64k and then let TGI auto-configure the rest.</div> <h2 class="relative group"><a id="supported-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#supported-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Supported models</span></h2> <p data-svelte-h="svelte-ta9nlr">You can find the models that are supported by TGI:</p> <ul data-svelte-h="svelte-9499tw"><li>Browse supported models on the <a href="https://huggingface.co/models?apps=tgi&sort=trending" rel="nofollow">Hugging Face Hub</a></li> <li>In the TGI documentation under the <a href="https://huggingface.co/docs/text-generation-inference/supported_models" rel="nofollow">supported models</a> section</li> <li>A selection of popular models in the <a href="https://endpoints.huggingface.co/huggingface/catalog" rel="nofollow">Inference Endpoints Catalog</a></li></ul> <p data-svelte-h="svelte-1qn69p6">If a model is supported by TGI, the Inference Endpoints UI will indicate this by disabling/enabling the selection under <code>Container Type</code> configuration.
	<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/tgi/tgi_selection.png" alt="TGI option under Container Type in the Inference Endpoints UI"></p> <h2 class="relative group"><a id="references" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#references"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>References</span></h2> <p data-svelte-h="svelte-zwya3k">We also recommend reading the <a href="https://huggingface.co/docs/text-generation-inference" rel="nofollow">TGI documentation</a> for more in-depth information.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/engines/tgi.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	// Page bootstrap: publish the path configuration, load the two entry
	// modules, and start the app on the element that contains this script.
	//
	// Assigned without var/let/const, so this is not a block-scoped binding;
	// in a classic (non-module) script it ends up on the global object.
	// NOTE(review): presumably read by the imported entry modules — confirm.
	__sveltekit_1q0n26o = {
	// `assets` and `base` are set to the same URL prefix for this build.
	assets: "/docs/inference-endpoints/pr_136/en",
	base: "/docs/inference-endpoints/pr_136/en",
	// No environment values are passed for this page.
	env: {}
	};

	// document.currentScript is only set while the script runs synchronously,
	// so the parent element is captured here, before the async callback below.
	const element = document.currentScript.parentElement;

	// Two null entries, handed to kit.start() as `data`.
	// NOTE(review): presumably one slot per entry in node_ids — confirm.
	const data = [null,null];

	// Load both entry modules in parallel; the callback runs once both resolve.
	Promise.all([
	import("/docs/inference-endpoints/pr_136/en/_app/immutable/entry/start.fb9ab4d6.js"),
	import("/docs/inference-endpoints/pr_136/en/_app/immutable/entry/app.6247727a.js")
	]).then(([kit, app]) => {
	// Start the app with the `app` module, the captured target element, and
	// the initial options (node ids, data, and null form/error).
	kit.start(app, element, {
	node_ids: [0, 8],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size: 15.4 kB
Xet hash: 5ca68d21e7f767d9e0f781cdde156a271117a73170cabbcc6bd37fbc211fdec1

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.