Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev/inference-endpoints/pr_145/en/_app/immutable/nodes/8.feed3dec.js

rtrm's picture

about 2 months ago

10.8 kB

	/**
	 * Compiled Svelte page module for the "Text Generation Inference (TGI)" docs page.
	 *
	 * This is a build artifact. The `source` prop passed to `Ge` below points at
	 * docs/source/engines/tgi.md, so copy changes belong in that markdown file.
	 *
	 * The single-letter helpers are imported from minified runtime chunks. Their
	 * roles as described in the comments below are inferred only from how they are
	 * called in this file -- confirm against the chunks before relying on them.
	 *
	 * NOTE(review): the page copy contains two links that look like placeholders
	 * ("https://raw.githubusercontent.com/not-here" and
	 * "https://huggingface.co/no-link-yet") and a few typos ("quantizations methods",
	 * "lenght", "to it's full capacity"). The strings are left byte-identical here
	 * because each one is paired with a "svelte-..." tag compared during claiming;
	 * fix them in the markdown source and rebuild.
	 */
	import { s as xe, o as be } from "../chunks/scheduler.a045fce0.js";
	import {
		S as ke,
		i as Ie,
		e as s,
		s as i,
		c as G,
		h as _e,
		a as l,
		d as n,
		b as a,
		f as we,
		g as H,
		j as u,
		k as ve,
		l as Le,
		m as o,
		n as q,
		t as S,
		o as z,
		p as B,
		r as Me,
		u as Ce,
	} from "../chunks/index.c7f31426.js";
	import { T as Pe } from "../chunks/Tip.06e542ce.js";
	import { H as U, E as Ge } from "../chunks/MermaidChart.svelte_svelte_type_style_lang.8ed610dd.js";

	/**
	 * Fragment used as the default slot of the `Pe` (Tip) component.
	 *
	 * It manages a single node `r` whose content is the context-length caveat text.
	 *
	 * @param {*} A - context passed in by the caller; not read by this fragment.
	 * @returns {{c: Function, l: Function, m: Function, d: Function}} fragment object.
	 */
	function He(A) {
		let r;
		// The same text is used by both the create (`c`) and claim (`l`) paths.
		const noteText = `Note that there's a caveat: say you're deploying \`meta-llama/Llama-3.3-70B-Instruct\`, which has a context length of 128k tokens.
	But you're on a GPU where you can only fit the model's context three times in memory. So if you want to serve the model with full context length,
	you can only serve up to 3 concurrent requests. In some cases, it's fine to drop the maximum context length to 64k tokens, which would
	allow the server to process 6 concurrent requests.
	You can configure this by setting max input length to 64k and then let TGI auto-configure the rest.`;
		return {
			c() {
				r = Me(noteText);
			},
			l(c) {
				r = Ce(c, noteText);
			},
			m(c, f) {
				o(c, r, f);
			},
			d(c) {
				// Only removes the node when the caller passes a truthy flag.
				if (c) {
					n(r);
				}
			},
		};
	}

	/**
	 * Fragment factory for the whole page.
	 *
	 * Builds four kinds of things, in document order: a <meta> element attached to
	 * `document.head`, static <p>/<ul> elements filled from the string constants
	 * below, separator nodes created by `i()` / `a(e)`, and child components
	 * (headings `U`, the tip `Pe`, and the footer `Ge`).
	 *
	 * @param {*} A - context, forwarded to the Tip's `$$scope`.
	 * @returns {object} fragment with `c`, `l`, `h`, `m`, `p`, `i`, `o`, `d` methods.
	 */
	function qe(A) {
		// Static page copy. `textContent` / `innerHTML` is chosen per string in `c()` and `l()`.
		const le = `TGI is a production-grade inference engine built in Rust and Python, designed for high-performance
	serving of open-source LLMs (e.g. LLaMA, Falcon, StarCoder, BLOOM and many more).
	The core features that make TGI a good choice are:`;
		const re = "<li><strong>Continuous batching + streaming</strong>: Dynamically groups in-flight requests and streams tokens via Server-Sent Events (SSE)</li> <li><strong>Optimized attention & decoding</strong>: TGI uses Flash Attention, Paged Attention, KV-caching, and custom CUDA kernels for latency and memory efficiency</li> <li><strong>Quantization & weight loading speed</strong>: Supports quantizations methods like bitsandbytes and GPTQ and uses Safetensors to reduce load times</li> <li><strong>Production readiness</strong>: Fully OpenAI-compatible <code>/v1/chat</code> or <code>/v1/completions</code> APIs, Prometheus metrics, OpenTelemetry tracing, watermarking, logit controls, JSON schema guidance</li>";
		const ue = `By default, the TGI version will be the latest available one (with some delay). But you can also specify a different version by <a href="https://raw.githubusercontent.com/not-here" rel="nofollow">changing
	the container URL</a>`;
		const ce = `When selecting a model to deploy, the Inference Endpoints UI automatically checks whether a model is supported by TGI. If it is, you’ll see
	the option presented under <code>Container Configuration</code> where you can change the following settings:`;
		const me = '<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/tgi/tgi_config.png" alt="config"/>';
		const fe = `<li><strong>Quantization</strong>: Which quantization method, if any, to use for the model.</li> <li><strong>Max Number of Tokens (per query)</strong>: Changes the maximum amount of tokens a request can contain.
	For example a value of <code>1512</code> means users can send either a prompt of <code>1000</code> tokens and generate <code>512</code> new tokens,
	or send a prompt of <code>1</code> token and generate <code>1511</code> new tokens. The larger this value, the larger amount each request
	will be in your RAM and the less effective batching can be.</li> <li><strong>Max Input Tokens (per query)</strong>: The maximum number of input tokens, meaning the amount of tokens in the prompt.</li> <li><strong>Max Batch Prefill Tokens</strong>: Limits the number of tokens for the prefill operation. Prefill tokens are the ones sent in with the user prompt.</li> <li><strong>Max Batch Total Tokens</strong>: This changes the total amount of potential tokens within a batch. Together with <code>Max Number of Tokens</code>,
	this determines how many concurrent requests you can serve. If you set <code>Max Number of Tokens</code> to 100 and <code>Max Batch Total Tokens</code> to 100 as well,
	you can only serve one request at a time.</li>`;
		const he = `In general zero-configuration (see below) is recommended for most cases. TGI supports several other configuration parameters and you’ll find a complete list
	in the <a href="https://huggingface.co/docs/text-generation-inference/reference/launcher#text-generation-launcher-arguments" rel="nofollow">TGI documentation</a>. These can all be
	set by passing the values as environment variables to the container, <a href="https://huggingface.co/no-link-yet" rel="nofollow">link to guide</a>.`;
		const pe = `Introduced in TGI v3, the zero-config mode helps you get the most out of your hardware without manual configuration and trial & error.
	If you leave the values undefined, TGI will on server startup automatically (based on the hardware it’s running on) select the maximal possible values
	for the max input lenght, max number of tokens, max batch prefill tokens and max batch total tokens. This means that you’ll use your hardware to it’s full capacity.`;
		const ge = "You can find the models that are supported by TGI:";
		const de = '<li>Browse supported models on the <a href="https://huggingface.co/models?apps=tgi&sort=trending" rel="nofollow">Hugging Face Hub</a></li> <li>In the TGI documentation under the <a href="https://huggingface.co/docs/text-generation-inference/supported_models" rel="nofollow">supported models</a> section</li> <li>A selection of popular models in the <a href="https://endpoints.huggingface.co/huggingface/catalog" rel="nofollow">Inference Endpoints Catalog</a></li>';
		const ye = `If a model is supported by TGI, the Inference Endpoints UI will indicate this by disabling/enabling the selection under <code>Container Type</code> configuration.
	<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/tgi/tgi_selection.png" alt="selection"/>`;
		const $e = 'We also recommend reading the <a href="https://huggingface.co/docs/text-generation-inference" rel="nofollow">TGI documentation</a> for more in-depth information.';

		// DOM nodes, assigned in `c()` (create) or `l()` (claim).
		let r, c, f, O, N, p, R, g, F, d, j, Q, $, W, T, Y, w, D, v, Z, J, b, K, V, X, I, ee, _, te, L, ne, oe, C, ie, ae, E;
		// Guard flag: set to true by `m` and `i`, reset to false by `o`.
		let se;

		// Child components.
		const h = new U({ props: { title: "Text Generation Inference (TGI)", local: "text-generation-inference-tgi", headingTag: "h1" } });
		const y = new U({ props: { title: "Configuration", local: "configuration", headingTag: "h2" } });
		const x = new U({ props: { title: "Zero configuration", local: "zero-configuration", headingTag: "h2" } });
		const m = new Pe({ props: { $$slots: { default: [He] }, $$scope: { ctx: A } } });
		const k = new U({ props: { title: "Supported models", local: "supported-models", headingTag: "h2" } });
		const M = new U({ props: { title: "References", local: "references", headingTag: "h2" } });
		const P = new Ge({ props: { source: "https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/engines/tgi.md" } });

		return {
			// Create every node from scratch, then apply attributes via `h()`.
			c() {
				r = s("meta");
				c = i();
				f = s("p");
				O = i();
				G(h.$$.fragment);
				N = i();
				p = s("p");
				p.textContent = le;
				R = i();
				g = s("ul");
				g.innerHTML = re;
				F = i();
				d = s("p");
				d.innerHTML = ue;
				j = i();
				G(y.$$.fragment);
				Q = i();
				$ = s("p");
				$.innerHTML = ce;
				W = i();
				T = s("p");
				T.innerHTML = me;
				Y = i();
				w = s("ul");
				w.innerHTML = fe;
				D = i();
				v = s("p");
				v.innerHTML = he;
				Z = i();
				G(x.$$.fragment);
				J = i();
				b = s("p");
				b.textContent = pe;
				K = i();
				G(m.$$.fragment);
				V = i();
				G(k.$$.fragment);
				X = i();
				I = s("p");
				I.textContent = ge;
				ee = i();
				_ = s("ul");
				_.innerHTML = de;
				te = i();
				L = s("p");
				L.innerHTML = ye;
				ne = i();
				G(M.$$.fragment);
				oe = i();
				C = s("p");
				C.innerHTML = $e;
				ie = i();
				G(P.$$.fragment);
				ae = i();
				E = s("p");
				this.h();
			},
			// Claim path: obtain nodes from the existing parent `e` instead of creating them.
			// Static content is only rewritten when the value returned by `u(node)`
			// differs from the compile-time "svelte-..." tag for that node.
			l(e) {
				const t = _e("svelte-u9bgzb", document.head);
				r = l(t, "META", { name: !0, content: !0 });
				t.forEach(n);
				c = a(e);
				f = l(e, "P", {});
				we(f).forEach(n);
				O = a(e);
				H(h.$$.fragment, e);
				N = a(e);
				p = l(e, "P", { "data-svelte-h": !0 });
				if (u(p) !== "svelte-1j6m1nq") {
					p.textContent = le;
				}
				R = a(e);
				g = l(e, "UL", { "data-svelte-h": !0 });
				if (u(g) !== "svelte-lswtjz") {
					g.innerHTML = re;
				}
				F = a(e);
				d = l(e, "P", { "data-svelte-h": !0 });
				if (u(d) !== "svelte-159wehp") {
					d.innerHTML = ue;
				}
				j = a(e);
				H(y.$$.fragment, e);
				Q = a(e);
				$ = l(e, "P", { "data-svelte-h": !0 });
				if (u($) !== "svelte-8ibp2c") {
					$.innerHTML = ce;
				}
				W = a(e);
				T = l(e, "P", { "data-svelte-h": !0 });
				if (u(T) !== "svelte-12tkryh") {
					T.innerHTML = me;
				}
				Y = a(e);
				w = l(e, "UL", { "data-svelte-h": !0 });
				if (u(w) !== "svelte-1vkh47x") {
					w.innerHTML = fe;
				}
				D = a(e);
				v = l(e, "P", { "data-svelte-h": !0 });
				if (u(v) !== "svelte-1b37qua") {
					v.innerHTML = he;
				}
				Z = a(e);
				H(x.$$.fragment, e);
				J = a(e);
				b = l(e, "P", { "data-svelte-h": !0 });
				if (u(b) !== "svelte-1d6c15z") {
					b.textContent = pe;
				}
				K = a(e);
				H(m.$$.fragment, e);
				V = a(e);
				H(k.$$.fragment, e);
				X = a(e);
				I = l(e, "P", { "data-svelte-h": !0 });
				if (u(I) !== "svelte-ta9nlr") {
					I.textContent = ge;
				}
				ee = a(e);
				_ = l(e, "UL", { "data-svelte-h": !0 });
				if (u(_) !== "svelte-9499tw") {
					_.innerHTML = de;
				}
				te = a(e);
				L = l(e, "P", { "data-svelte-h": !0 });
				if (u(L) !== "svelte-1qn69p6") {
					L.innerHTML = ye;
				}
				ne = a(e);
				H(M.$$.fragment, e);
				oe = a(e);
				C = l(e, "P", { "data-svelte-h": !0 });
				if (u(C) !== "svelte-zwya3k") {
					C.innerHTML = $e;
				}
				ie = a(e);
				H(P.$$.fragment, e);
				ae = a(e);
				E = l(e, "P", {});
				we(E).forEach(n);
				this.h();
			},
			// Attributes shared by the create and claim paths.
			h() {
				ve(r, "name", "hf:doc:metadata");
				ve(r, "content", Se);
			},
			// Mount: the <meta> goes into document.head, everything else into `e` at anchor `t`.
			m(e, t) {
				Le(document.head, r);
				o(e, c, t);
				o(e, f, t);
				o(e, O, t);
				q(h, e, t);
				o(e, N, t);
				o(e, p, t);
				o(e, R, t);
				o(e, g, t);
				o(e, F, t);
				o(e, d, t);
				o(e, j, t);
				q(y, e, t);
				o(e, Q, t);
				o(e, $, t);
				o(e, W, t);
				o(e, T, t);
				o(e, Y, t);
				o(e, w, t);
				o(e, D, t);
				o(e, v, t);
				o(e, Z, t);
				q(x, e, t);
				o(e, J, t);
				o(e, b, t);
				o(e, K, t);
				q(m, e, t);
				o(e, V, t);
				q(k, e, t);
				o(e, X, t);
				o(e, I, t);
				o(e, ee, t);
				o(e, _, t);
				o(e, te, t);
				o(e, L, t);
				o(e, ne, t);
				q(M, e, t);
				o(e, oe, t);
				o(e, C, t);
				o(e, ie, t);
				q(P, e, t);
				o(e, ae, t);
				o(e, E, t);
				se = !0;
			},
			// Update: only the Tip receives changes; its `$$scope` is refreshed when bit 2 of the dirty mask is set.
			p(e, [t]) {
				const Te = {};
				if (t & 2) {
					Te.$$scope = { dirty: t, ctx: e };
				}
				m.$set(Te);
			},
			// Intro: skipped when `se` is already true.
			// The original text here was `se\|\|(...)`, an escaped `||` that does not parse.
			i(e) {
				if (se) {
					return;
				}
				S(h.$$.fragment, e);
				S(y.$$.fragment, e);
				S(x.$$.fragment, e);
				S(m.$$.fragment, e);
				S(k.$$.fragment, e);
				S(M.$$.fragment, e);
				S(P.$$.fragment, e);
				se = !0;
			},
			// Outro: always runs for every child component, then clears the guard flag.
			o(e) {
				z(h.$$.fragment, e);
				z(y.$$.fragment, e);
				z(x.$$.fragment, e);
				z(m.$$.fragment, e);
				z(k.$$.fragment, e);
				z(M.$$.fragment, e);
				z(P.$$.fragment, e);
				se = !1;
			},
			// Destroy: body nodes are removed only when `e` is truthy; the head <meta>
			// and the child components are always torn down.
			d(e) {
				if (e) {
					n(c);
					n(f);
					n(O);
					n(N);
					n(p);
					n(R);
					n(g);
					n(F);
					n(d);
					n(j);
					n(Q);
					n($);
					n(W);
					n(T);
					n(Y);
					n(w);
					n(D);
					n(v);
					n(Z);
					n(J);
					n(b);
					n(K);
					n(V);
					n(X);
					n(I);
					n(ee);
					n(_);
					n(te);
					n(L);
					n(ne);
					n(oe);
					n(C);
					n(ie);
					n(ae);
					n(E);
				}
				n(r);
				B(h, e);
				B(y, e);
				B(x, e);
				B(m, e);
				B(k, e);
				B(M, e);
				B(P, e);
			},
		};
	}

	// Serialized page metadata (title plus section outline), written to the
	// <meta name="hf:doc:metadata"> element's `content` attribute in `h()`.
	const Se = '{"title":"Text Generation Inference (TGI)","local":"text-generation-inference-tgi","sections":[{"title":"Configuration","local":"configuration","sections":[],"depth":2},{"title":"Zero configuration","local":"zero-configuration","sections":[],"depth":2},{"title":"Supported models","local":"supported-models","sections":[],"depth":2},{"title":"References","local":"references","sections":[],"depth":2}],"depth":1}';

	/**
	 * Instance initializer handed to `Ie`.
	 *
	 * Registers a callback via `be` that reads the `fw` query parameter from the
	 * current URL and discards the value. Returns an empty context array.
	 *
	 * @param {*} A - unused.
	 * @returns {Array} always an empty array.
	 */
	function ze(A) {
		be(() => {
			new URLSearchParams(window.location.search).get("fw");
		});
		return [];
	}

	/** Page component: wires `ze` (instance) and `qe` (fragment) together through `Ie`. */
	class Oe extends ke {
		constructor(r) {
			super();
			Ie(this, r, ze, qe, xe, {});
		}
	}

	export { Oe as component };

Xet Storage Details

Size: 10.8 kB
Xet hash: 080b161b40e04af8f163e8169edee30c273cb80e31372b4ea6e962dd4840d1f6

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.