Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev / inference-endpoints /pr_152 /en /_app /immutable /nodes /10.bc8a80c2.js

rtrm's picture

about 2 months ago

7.14 kB

	// Compiled Svelte page module for the "vLLM" documentation page.
	//
	// Every runtime helper is imported from a sibling chunk under a minified alias.
	// Their implementations are not visible in this file, so the comments below
	// describe them by how they are called here and mark guesses as such.
	import { s as de, n as ce, o as ge } from "../chunks/scheduler.eb244325.js";
	import {
	  S as he,
	  i as ve,
	  e as s,
	  s as i,
	  c as w,
	  h as $e,
	  a as o,
	  d as n,
	  b as a,
	  f as pe,
	  g as C,
	  j as r,
	  k as fe,
	  l as be,
	  m as l,
	  n as P,
	  t as H,
	  o as k,
	  p as S,
	} from "../chunks/index.661680a1.js";
	import { C as Le, H as Z, E as Me } from "../chunks/MermaidChart.svelte_svelte_type_style_lang.83ba2125.js";

	/**
	 * Fragment factory for the page.
	 *
	 * Returns an object with the lifecycle hooks `c`, `l`, `h`, `m`, `p`, `i`, `o`
	 * and `d`, which are handed to `ve(...)` by the `Pe` constructor.
	 *
	 * @param {*} ee - Unused in this function.
	 * @returns {object} The fragment object described above.
	 */
	function _e(ee) {
	  // Node handles, assigned in c() (fresh build) or l() (claim from existing DOM).
	  let m, E, q, U, z, G, f, I, d, N, c, D, F, h, R, v, V, $, j, b, K, O, M, W, _, B, J, y, Q, X, A;
	  // Tracks whether the child components' intro hooks have already been run.
	  let Y;

	  // Static page content. These strings are written into the DOM verbatim
	  // (textContent / innerHTML), so they must stay byte-identical.
	  const te = `vLLM is a high-performance, memory-efficient inference engine for open-source LLMs. It delivers efficient scheduling, KV-cache handling,
	batching, and decoding—all wrapped in a production-ready server. For most use cases, TGI, vLLM, and SGLang will be equivalently good options.`;
	  const ne = "<strong>Core features</strong>:";
	  const le = "<li><strong>PagedAttention for memory efficiency</strong></li> <li><strong>Continuous batching</strong></li> <li><strong>Optimized CUDA/HIP execution</strong></li> <li><strong>Speculative decoding & chunked prefill</strong></li> <li><strong>Multi-backend and hardware support</strong>: Runs across NVIDIA, AMD, and AWS Neuron to name a few</li>";
	  const ie = '<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/vllm/vllm_config.png" alt="config"/>';
	  const ae = `<li><strong>Max Number of Sequences</strong>: The maximum number of sequences (requests) that can be processed together in a single batch. Controls
	the batch size by sequence count, affecting throughput and memory usage. For example, if max_num_seqs=8, up to 8 different prompts can
	be handled at once, regardless of their individual lengths, as long as the total token count also fits within the Max Number of Batched Tokens.</li> <li><strong>Max Number of Batched Tokens</strong>: The maximum total number of tokens (summed across all sequences) that can be processed in a single
	batch. Limits batch size by token count, balancing throughput and GPU memory allocation.</li> <li><strong>Tensor Parallel Size</strong>: The number of GPUs across which model weights are split within each layer. Increasing this allows larger
	models to run and frees up GPU memory for KV cache, but may introduce synchronization overhead.</li> <li><strong>KV Cache DType</strong>: the data type used for storing the key-value cache during generation. Options include “auto”, “fp8”, “fp8_e5m2”,
	and “fp8_e4m3”. Using lower precision types can reduce memory usage but may slightly impact generation quality.</li>`;
	  const se = `For more advanced configuration you can pass any of the <a href="https://docs.vllm.ai/en/stable/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs" rel="nofollow">Engine Arguments that vLLM supports</a>
	as container arguments. For example changing the <code>enable_lora</code> to <code>true</code> would look like this:`;
	  const oe = '<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/vllm/vllm-advanced.png" alt="vllm-advanced"/>';
	  const re = `vLLM has wide support for large language models and embedding models. We recommend reading the
	<a href="https://docs.vllm.ai/en/stable/models/supported_models.html?h=supported+models" rel="nofollow">supported models</a> section in the vLLM documentation for a full list.`;
	  // NOTE(review): the second sentence below is grammatically garbled. It is left
	  // untouched on purpose: l() only rewrites the node when its marker differs from
	  // "svelte-3apbod" (presumably a content hash), so the wording should be fixed
	  // in the source markdown and the page rebuilt.
	  const me = `vLLM also supports model implementations that are available in Transformers. Currently not all models work but support is planned for most
	decoder language models are supported, and vision language models.`;
	  const ue = 'We also recommend reading the <a href="https://docs.vllm.ai/en/stable/" rel="nofollow">vLLM documentation</a> for more in-depth information.';

	  // Child components: one Le instance, four Z headings and one Me instance.
	  const u = new Le({
	    props: {
	      containerStyle: "float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;",
	    },
	  });
	  const p = new Z({ props: { title: "vLLM", local: "vllm", headingTag: "h1" } });
	  const g = new Z({ props: { title: "Configuration", local: "configuration", headingTag: "h2" } });
	  const L = new Z({ props: { title: "Supported models", local: "supported-models", headingTag: "h2" } });
	  const x = new Z({ props: { title: "References", local: "references", headingTag: "h2" } });
	  const T = new Me({
	    props: {
	      source: "https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/engines/vllm.md",
	    },
	  });

	  return {
	    // Create: builds every node from scratch, in document order.
	    c() {
	      m = s("meta");
	      E = i();
	      q = s("p");
	      U = i();
	      w(u.$$.fragment);
	      z = i();
	      w(p.$$.fragment);
	      G = i();
	      f = s("p");
	      f.textContent = te;
	      I = i();
	      d = s("p");
	      d.innerHTML = ne;
	      N = i();
	      c = s("ul");
	      c.innerHTML = le;
	      D = i();
	      w(g.$$.fragment);
	      F = i();
	      h = s("p");
	      h.innerHTML = ie;
	      R = i();
	      v = s("ul");
	      v.innerHTML = ae;
	      V = i();
	      $ = s("p");
	      $.innerHTML = se;
	      j = i();
	      b = s("p");
	      b.innerHTML = oe;
	      K = i();
	      w(L.$$.fragment);
	      O = i();
	      M = s("p");
	      M.innerHTML = re;
	      W = i();
	      _ = s("p");
	      _.textContent = me;
	      B = i();
	      w(x.$$.fragment);
	      J = i();
	      y = s("p");
	      y.innerHTML = ue;
	      Q = i();
	      w(T.$$.fragment);
	      X = i();
	      A = s("p");
	      this.h();
	    },

	    // Claim: same node sequence as c(), but obtained from the nodes passed in `e`.
	    // A static node's content is only rewritten when the value returned by r(node)
	    // differs from the expected "svelte-..." marker.
	    l(e) {
	      const t = $e("svelte-u9bgzb", document.head);
	      m = o(t, "META", { name: true, content: true });
	      t.forEach(n);
	      E = a(e);
	      q = o(e, "P", {});
	      pe(q).forEach(n);
	      U = a(e);
	      C(u.$$.fragment, e);
	      z = a(e);
	      C(p.$$.fragment, e);
	      G = a(e);
	      f = o(e, "P", { "data-svelte-h": true });
	      if (r(f) !== "svelte-3mges0") f.textContent = te;
	      I = a(e);
	      d = o(e, "P", { "data-svelte-h": true });
	      if (r(d) !== "svelte-wxr29x") d.innerHTML = ne;
	      N = a(e);
	      c = o(e, "UL", { "data-svelte-h": true });
	      if (r(c) !== "svelte-j2rm7r") c.innerHTML = le;
	      D = a(e);
	      C(g.$$.fragment, e);
	      F = a(e);
	      h = o(e, "P", { "data-svelte-h": true });
	      if (r(h) !== "svelte-mut1up") h.innerHTML = ie;
	      R = a(e);
	      v = o(e, "UL", { "data-svelte-h": true });
	      if (r(v) !== "svelte-q07o07") v.innerHTML = ae;
	      V = a(e);
	      $ = o(e, "P", { "data-svelte-h": true });
	      if (r($) !== "svelte-ylkbud") $.innerHTML = se;
	      j = a(e);
	      b = o(e, "P", { "data-svelte-h": true });
	      if (r(b) !== "svelte-lhbgtj") b.innerHTML = oe;
	      K = a(e);
	      C(L.$$.fragment, e);
	      O = a(e);
	      M = o(e, "P", { "data-svelte-h": true });
	      if (r(M) !== "svelte-1ebg7ne") M.innerHTML = re;
	      W = a(e);
	      _ = o(e, "P", { "data-svelte-h": true });
	      if (r(_) !== "svelte-3apbod") _.textContent = me;
	      B = a(e);
	      C(x.$$.fragment, e);
	      J = a(e);
	      y = o(e, "P", { "data-svelte-h": true });
	      if (r(y) !== "svelte-1lr4w3") y.innerHTML = ue;
	      Q = a(e);
	      C(T.$$.fragment, e);
	      X = a(e);
	      A = o(e, "P", {});
	      pe(A).forEach(n);
	      this.h();
	    },

	    // Shared tail of c() and l(): sets the name/content pair on the <meta> node.
	    h() {
	      fe(m, "name", "hf:doc:metadata");
	      fe(m, "content", xe);
	    },

	    // Mount: the <meta> node goes into document.head; everything else is placed
	    // relative to target `e` and anchor `t`, in document order.
	    m(e, t) {
	      be(document.head, m);
	      l(e, E, t);
	      l(e, q, t);
	      l(e, U, t);
	      P(u, e, t);
	      l(e, z, t);
	      P(p, e, t);
	      l(e, G, t);
	      l(e, f, t);
	      l(e, I, t);
	      l(e, d, t);
	      l(e, N, t);
	      l(e, c, t);
	      l(e, D, t);
	      P(g, e, t);
	      l(e, F, t);
	      l(e, h, t);
	      l(e, R, t);
	      l(e, v, t);
	      l(e, V, t);
	      l(e, $, t);
	      l(e, j, t);
	      l(e, b, t);
	      l(e, K, t);
	      P(L, e, t);
	      l(e, O, t);
	      l(e, M, t);
	      l(e, W, t);
	      l(e, _, t);
	      l(e, B, t);
	      P(x, e, t);
	      l(e, J, t);
	      l(e, y, t);
	      l(e, Q, t);
	      P(T, e, t);
	      l(e, X, t);
	      l(e, A, t);
	      Y = true;
	    },

	    // Update hook: the scheduler chunk's `n` export (presumably a no-op, since the
	    // page has no reactive state — confirm against the chunk).
	    p: ce,

	    // Intro: runs H on each child fragment once, guarded by Y.
	    // The original text had the OR operator written as `\|\|`, which is not valid
	    // JavaScript; the guard below is the intended `Y || (...)`.
	    i(e) {
	      if (Y) return;
	      H(u.$$.fragment, e);
	      H(p.$$.fragment, e);
	      H(g.$$.fragment, e);
	      H(L.$$.fragment, e);
	      H(x.$$.fragment, e);
	      H(T.$$.fragment, e);
	      Y = true;
	    },

	    // Outro: runs k on each child fragment and clears the guard.
	    o(e) {
	      k(u.$$.fragment, e);
	      k(p.$$.fragment, e);
	      k(g.$$.fragment, e);
	      k(L.$$.fragment, e);
	      k(x.$$.fragment, e);
	      k(T.$$.fragment, e);
	      Y = false;
	    },

	    // Destroy: body nodes are passed to n only when `e` is truthy; the <meta>
	    // node and the child components are always torn down.
	    d(e) {
	      if (e) {
	        n(E);
	        n(q);
	        n(U);
	        n(z);
	        n(G);
	        n(f);
	        n(I);
	        n(d);
	        n(N);
	        n(c);
	        n(D);
	        n(F);
	        n(h);
	        n(R);
	        n(v);
	        n(V);
	        n($);
	        n(j);
	        n(b);
	        n(K);
	        n(O);
	        n(M);
	        n(W);
	        n(_);
	        n(B);
	        n(J);
	        n(y);
	        n(Q);
	        n(X);
	        n(A);
	      }
	      n(m);
	      S(u, e);
	      S(p, e);
	      S(g, e);
	      S(L, e);
	      S(x, e);
	      S(T, e);
	    },
	  };
	}

	// JSON text written to the "content" attribute of the hf:doc:metadata <meta> tag.
	const xe = '{"title":"vLLM","local":"vllm","sections":[{"title":"Configuration","local":"configuration","sections":[],"depth":2},{"title":"Supported models","local":"supported-models","sections":[],"depth":2},{"title":"References","local":"references","sections":[],"depth":2}],"depth":1}';

	/**
	 * Instance function passed to `ve(...)`.
	 *
	 * Registers a callback through `ge` (the scheduler chunk's `o` export —
	 * presumably onMount) that reads the "fw" query parameter and discards it.
	 *
	 * @param {*} ee - Unused in this function.
	 * @returns {Array} Always an empty array.
	 */
	function ye(ee) {
	  ge(() => {
	    new URLSearchParams(window.location.search).get("fw");
	  });
	  return [];
	}

	/** Page component: wires `ye` and `_e` into the base class via `ve`. */
	class Pe extends he {
	  constructor(m) {
	    super();
	    ve(this, m, ye, _e, de, {});
	  }
	}

	export { Pe as component };

Xet Storage Details

Size: 7.14 kB
Xet hash: 4c742c7950a58a7175b977c8dfe99c69d92fcc7c3b46df549dc555f8b4661f4c

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.