Buckets:
| import{s as xe,o as be}from"../chunks/scheduler.a045fce0.js";import{S as ke,i as Ie,e as s,s as i,c as G,h as _e,a as l,d as n,b as a,f as we,g as H,j as u,k as ve,l as Le,m as o,n as q,t as S,o as z,p as B,r as Me,u as Ce}from"../chunks/index.c7f31426.js";import{T as Pe}from"../chunks/Tip.06e542ce.js";import{H as U,E as Ge}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.8ed610dd.js";function He(A){let r;return{c(){r=Me(`Note that there's a caveat: say you're deploying \`meta-llama/Llama-3.3-70B-Instruct\`, which has a context length of 128k tokens. | |
| But you're on a GPU where you can only fit the model's context three times in memory. So if you want to serve the model with full context length, | |
| you can only serve up to 3 concurrent requests. In some cases, it's fine to drop the maximum context length to 64k tokens, which would | |
| allow the server to process 6 concurrent requests. | |
| You can configure this by setting max input length to 64k and then let TGI auto-configure the rest.`)},l(c){r=Ce(c,`Note that there's a caveat: say you're deploying \`meta-llama/Llama-3.3-70B-Instruct\`, which has a context length of 128k tokens. | |
| But you're on a GPU where you can only fit the model's context three times in memory. So if you want to serve the model with full context length, | |
| you can only serve up to 3 concurrent requests. In some cases, it's fine to drop the maximum context length to 64k tokens, which would | |
| allow the server to process 6 concurrent requests. | |
| You can configure this by setting max input length to 64k and then let TGI auto-configure the rest.`)},m(c,f){o(c,r,f)},d(c){c&&n(r)}}}function qe(A){let r,c,f,O,h,N,p,le=`TGI is a production-grade inference engine built in Rust and Python, designed for high-performance | |
| serving of open-source LLMs (e.g. LLaMA, Falcon, StarCoder, BLOOM and many more). | |
| The core features that make TGI a good choice are:`,R,g,re="<li><strong>Continuous batching + streaming</strong>: Dynamically groups in-flight requests and streams tokens via Server-Sent Events (SSE)</li> <li><strong>Optimized attention & decoding</strong>: TGI uses Flash Attention, Paged Attention, KV-caching, and custom CUDA kernels for latency and memory efficiency</li> <li><strong>Quantization & weight loading speed</strong>: Supports quantizations methods like bitsandbytes and GPTQ and uses Safetensors to reduce load times</li> <li><strong>Production readiness</strong>: Fully OpenAI-compatible <code>/v1/chat</code> or <code>/v1/completions</code> APIs, Prometheus metrics, OpenTelemetry tracing, watermarking, logit controls, JSON schema guidance</li>",F,d,ue=`By default, the TGI version will be the latest available one (with some delay). But you can also specify a different version by <a href="https://raw.githubusercontent.com/not-here" rel="nofollow">changing | |
| the container URL</a>`,j,y,Q,$,ce=`When selecting a model to deploy, the Inference Endpoints UI automatically checks whether a model is supported by TGI. If it is, you’ll see | |
| the option presented under <code>Container Configuration</code> where you can change the following settings:`,W,T,me='<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/tgi/tgi_config.png" alt="config"/>',Y,w,fe=`<li><strong>Quantization</strong>: Which quantization method, if any, to use for the model.</li> <li><strong>Max Number of Tokens (per query)</strong>: Changes the maximum amount of tokens a request can contain. | |
| For example a value of <code>1512</code> means users can send either a prompt of <code>1000</code> tokens and generate <code>512</code> new tokens, | |
| or send a prompt of <code>1</code> token and generate <code>1511</code> new tokens. The larger this value, the larger amount each request | |
| will be in your RAM and the less effective batching can be.</li> <li><strong>Max Input Tokens (per query)</strong>: The maximum number of input tokens, meaning the amount of tokens in the prompt.</li> <li><strong>Max Batch Prefill Tokens</strong>: Limits the number of tokens for the prefill operation. Prefill tokens are the ones sent in with the user prompt.</li> <li><strong>Max Batch Total Tokens</strong>: This changes the total amount of potential tokens within a batch. Together with <code>Max Number of Tokens</code>, | |
| this determines how many concurrent requests you can serve. If you set <code>Max Number of Tokens</code> to 100 and <code>Max Batch Total Tokens</code> to 100 as well, | |
| you can only serve one request at a time.</li>`,D,v,he=`In general zero-configuration (see below) is recommended for most cases. TGI supports several other configuration parameters and you’ll find a complete list | |
| in the <a href="https://huggingface.co/docs/text-generation-inference/reference/launcher#text-generation-launcher-arguments" rel="nofollow">TGI documentation</a>. These can all be | |
| set by passing the values as environment variables to the container, <a href="https://huggingface.co/no-link-yet" rel="nofollow">link to guide</a>.`,Z,x,J,b,pe=`Introduced in TGI v3, the zero-config mode helps you get the most out of your hardware without manual configuration and trial & error. | |
| If you leave the values undefined, TGI will on server startup automatically (based on the hardware it’s running on) select the maximal possible values | |
| for the max input lenght, max number of tokens, max batch prefill tokens and max batch total tokens. This means that you’ll use your hardware to it’s full capacity.`,K,m,V,k,X,I,ge="You can find the models that are supported by TGI:",ee,_,de='<li>Browse supported models on the <a href="https://huggingface.co/models?apps=tgi&sort=trending" rel="nofollow">Hugging Face Hub</a></li> <li>In the TGI documentation under the <a href="https://huggingface.co/docs/text-generation-inference/supported_models" rel="nofollow">supported models</a> section</li> <li>A selection of popular models in the <a href="https://endpoints.huggingface.co/huggingface/catalog" rel="nofollow">Inference Endpoints Catalog</a></li>',te,L,ye=`If a model is supported by TGI, the Inference Endpoints UI will indicate this by disabling/enabling the selection under <code>Container Type</code> configuration. | |
| <img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/tgi/tgi_selection.png" alt="selection"/>`,ne,M,oe,C,$e='We also recommend reading the <a href="https://huggingface.co/docs/text-generation-inference" rel="nofollow">TGI documentation</a> for more in-depth information.',ie,P,ae,E,se;return h=new U({props:{title:"Text Generation Inference (TGI)",local:"text-generation-inference-tgi",headingTag:"h1"}}),y=new U({props:{title:"Configuration",local:"configuration",headingTag:"h2"}}),x=new U({props:{title:"Zero configuration",local:"zero-configuration",headingTag:"h2"}}),m=new Pe({props:{$$slots:{default:[He]},$$scope:{ctx:A}}}),k=new U({props:{title:"Supported models",local:"supported-models",headingTag:"h2"}}),M=new U({props:{title:"References",local:"references",headingTag:"h2"}}),P=new Ge({props:{source:"https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/engines/tgi.md"}}),{c(){r=s("meta"),c=i(),f=s("p"),O=i(),G(h.$$.fragment),N=i(),p=s("p"),p.textContent=le,R=i(),g=s("ul"),g.innerHTML=re,F=i(),d=s("p"),d.innerHTML=ue,j=i(),G(y.$$.fragment),Q=i(),$=s("p"),$.innerHTML=ce,W=i(),T=s("p"),T.innerHTML=me,Y=i(),w=s("ul"),w.innerHTML=fe,D=i(),v=s("p"),v.innerHTML=he,Z=i(),G(x.$$.fragment),J=i(),b=s("p"),b.textContent=pe,K=i(),G(m.$$.fragment),V=i(),G(k.$$.fragment),X=i(),I=s("p"),I.textContent=ge,ee=i(),_=s("ul"),_.innerHTML=de,te=i(),L=s("p"),L.innerHTML=ye,ne=i(),G(M.$$.fragment),oe=i(),C=s("p"),C.innerHTML=$e,ie=i(),G(P.$$.fragment),ae=i(),E=s("p"),this.h()},l(e){const t=_e("svelte-u9bgzb",document.head);r=l(t,"META",{name:!0,content:!0}),t.forEach(n),c=a(e),f=l(e,"P",{}),we(f).forEach(n),O=a(e),H(h.$$.fragment,e),N=a(e),p=l(e,"P",{"data-svelte-h":!0}),u(p)!=="svelte-1j6m1nq"&&(p.textContent=le),R=a(e),g=l(e,"UL",{"data-svelte-h":!0}),u(g)!=="svelte-lswtjz"&&(g.innerHTML=re),F=a(e),d=l(e,"P",{"data-svelte-h":!0}),u(d)!=="svelte-159wehp"&&(d.innerHTML=ue),j=a(e),H(y.$$.fragment,e),Q=a(e),$=l(e,"P",{"data-svelte-h":!0}),u($)!=="svelte-8ibp2c"&&($.innerHTML=ce),W=a(e),T=l(e,"P",{"data-svelte-h":!0}),u(T)!=="svelte-12tkryh"&&(T.innerHTML=me),Y=a(e),w=l(e,"UL",{"data-svelte-h":!0}),u(w)!=="svelte-1vkh47x"&&(w.innerHTML=fe),D=a(e),v=l(e,"P",{"data-svelte-h":!0}),u(v)!=="svelte-1b37qua"&&(v.innerHTML=he),Z=a(e),H(x.$$.fragment,e),J=a(e),b=l(e,"P",{"data-svelte-h":!0}),u(b)!=="svelte-1d6c15z"&&(b.textContent=pe),K=a(e),H(m.$$.fragment,e),V=a(e),H(k.$$.fragment,e),X=a(e),I=l(e,"P",{"data-svelte-h":!0}),u(I)!=="svelte-ta9nlr"&&(I.textContent=ge),ee=a(e),_=l(e,"UL",{"data-svelte-h":!0}),u(_)!=="svelte-9499tw"&&(_.innerHTML=de),te=a(e),L=l(e,"P",{"data-svelte-h":!0}),u(L)!=="svelte-1qn69p6"&&(L.innerHTML=ye),ne=a(e),H(M.$$.fragment,e),oe=a(e),C=l(e,"P",{"data-svelte-h":!0}),u(C)!=="svelte-zwya3k"&&(C.innerHTML=$e),ie=a(e),H(P.$$.fragment,e),ae=a(e),E=l(e,"P",{}),we(E).forEach(n),this.h()},h(){ve(r,"name","hf:doc:metadata"),ve(r,"content",Se)},m(e,t){Le(document.head,r),o(e,c,t),o(e,f,t),o(e,O,t),q(h,e,t),o(e,N,t),o(e,p,t),o(e,R,t),o(e,g,t),o(e,F,t),o(e,d,t),o(e,j,t),q(y,e,t),o(e,Q,t),o(e,$,t),o(e,W,t),o(e,T,t),o(e,Y,t),o(e,w,t),o(e,D,t),o(e,v,t),o(e,Z,t),q(x,e,t),o(e,J,t),o(e,b,t),o(e,K,t),q(m,e,t),o(e,V,t),q(k,e,t),o(e,X,t),o(e,I,t),o(e,ee,t),o(e,_,t),o(e,te,t),o(e,L,t),o(e,ne,t),q(M,e,t),o(e,oe,t),o(e,C,t),o(e,ie,t),q(P,e,t),o(e,ae,t),o(e,E,t),se=!0},p(e,[t]){const Te={};t&2&&(Te.$$scope={dirty:t,ctx:e}),m.$set(Te)},i(e){se||(S(h.$$.fragment,e),S(y.$$.fragment,e),S(x.$$.fragment,e),S(m.$$.fragment,e),S(k.$$.fragment,e),S(M.$$.fragment,e),S(P.$$.fragment,e),se=!0)},o(e){z(h.$$.fragment,e),z(y.$$.fragment,e),z(x.$$.fragment,e),z(m.$$.fragment,e),z(k.$$.fragment,e),z(M.$$.fragment,e),z(P.$$.fragment,e),se=!1},d(e){e&&(n(c),n(f),n(O),n(N),n(p),n(R),n(g),n(F),n(d),n(j),n(Q),n($),n(W),n(T),n(Y),n(w),n(D),n(v),n(Z),n(J),n(b),n(K),n(V),n(X),n(I),n(ee),n(_),n(te),n(L),n(ne),n(oe),n(C),n(ie),n(ae),n(E)),n(r),B(h,e),B(y,e),B(x,e),B(m,e),B(k,e),B(M,e),B(P,e)}}}const Se='{"title":"Text Generation Inference (TGI)","local":"text-generation-inference-tgi","sections":[{"title":"Configuration","local":"configuration","sections":[],"depth":2},{"title":"Zero configuration","local":"zero-configuration","sections":[],"depth":2},{"title":"Supported models","local":"supported-models","sections":[],"depth":2},{"title":"References","local":"references","sections":[],"depth":2}],"depth":1}';function ze(A){return be(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Oe extends ke{constructor(r){super(),Ie(this,r,ze,qe,xe,{})}}export{Oe as component}; | |
Xet Storage Details
- Size:
- 10.8 kB
- Xet hash:
- 080b161b40e04af8f163e8169edee30c273cb80e31372b4ea6e962dd4840d1f6
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.