Buckets:
// Compiled Svelte page module for the "vLLM" documentation page.
//
// This is generated output. The imported helpers are minified aliases whose
// implementations live in the chunk files; the notes below describe only what
// this module does with them. NOTE(review): the aliases look like the usual
// Svelte internal runtime helpers (element/space creation, claim-based
// hydration, mount/destroy of child components) - confirm against the chunks.
//
// Stray "|" table-border characters that had been injected at the physical
// line breaks (before `import`, after the final `export`, and inside the
// multi-line template literals) are removed here: they made the module
// unparseable and would have leaked literal pipes into the rendered text.
import { s as de, n as ce, o as ge } from "../chunks/scheduler.eb244325.js";
import {
  S as he,
  i as ve,
  e as s,
  s as i,
  c as w,
  h as $e,
  a as o,
  d as n,
  b as a,
  f as pe,
  g as C,
  j as r,
  k as fe,
  l as be,
  m as l,
  n as P,
  t as H,
  o as k,
  p as S,
} from "../chunks/index.661680a1.js";
import { C as Le, H as Z, E as Me } from "../chunks/MermaidChart.svelte_svelte_type_style_lang.83ba2125.js";

/**
 * Fragment factory for the page.
 *
 * Builds the static page content (intro, "Configuration", "Supported models",
 * "References") and returns an object with the lifecycle hooks
 * `c`, `l`, `h`, `m`, `p`, `i`, `o`, `d` that the component initializer
 * (`ve`, called in `Pe`) receives.
 *
 * @param {*} ee - Passed in by the caller; not read by this fragment.
 * @returns {object} Fragment object with create/claim/mount/destroy hooks.
 */
function _e(ee) {
  // DOM nodes: `m` is the <meta> placed in document.head; the remaining
  // single-letter names alternate between content nodes and the separator
  // nodes produced by `i()` / `a(e)`.
  let m, E, q, U, z, G, f, I, d, N, c, D, F, h, R, v, V, $, j, b, K, O, M, W, _, B, J, y, Q, X, A;
  // Child component instances (copy button, headings, edit-on-GitHub link).
  let u, p, g, L, x, T;
  // Set to true after mount / intro and back to false on outro.
  let Y;

  // Static text / HTML payloads. Continuation lines of the template literals
  // start at column 0 on purpose: any indentation would become part of the
  // string value.
  const te = `vLLM is a high-performance, memory-efficient inference engine for open-source LLMs. It delivers efficient scheduling, KV-cache handling,
batching, and decoding—all wrapped in a production-ready server. For most use cases, TGI, vLLM, and SGLang will be equivalently good options.`;
  const ne = "<strong>Core features</strong>:";
  const le = "<li><strong>PagedAttention for memory efficiency</strong></li> <li><strong>Continuous batching</strong></li> <li><strong>Optimized CUDA/HIP execution</strong></li> <li><strong>Speculative decoding & chunked prefill</strong></li> <li><strong>Multi-backend and hardware support</strong>: Runs across NVIDIA, AMD, and AWS Neuron to name a few</li>";
  const ie = '<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/vllm/vllm_config.png" alt="config"/>';
  const ae = `<li><strong>Max Number of Sequences</strong>: The maximum number of sequences (requests) that can be processed together in a single batch. Controls
the batch size by sequence count, affecting throughput and memory usage. For example, if max_num_seqs=8, up to 8 different prompts can
be handled at once, regardless of their individual lengths, as long as the total token count also fits within the Max Number of Batched Tokens.</li> <li><strong>Max Number of Batched Tokens</strong>: The maximum total number of tokens (summed across all sequences) that can be processed in a single
batch. Limits batch size by token count, balancing throughput and GPU memory allocation.</li> <li><strong>Tensor Parallel Size</strong>: The number of GPUs across which model weights are split within each layer. Increasing this allows larger
models to run and frees up GPU memory for KV cache, but may introduce synchronization overhead.</li> <li><strong>KV Cache DType</strong>: the data type used for storing the key-value cache during generation. Options include “auto”, “fp8”, “fp8_e5m2”,
and “fp8_e4m3”. Using lower precision types can reduce memory usage but may slightly impact generation quality.</li>`;
  const se = `For more advanced configuration you can pass any of the <a href="https://docs.vllm.ai/en/stable/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs" rel="nofollow">Engine Arguments that vLLM supports</a>
as container arguments. For example changing the <code>enable_lora</code> to <code>true</code> would look like this:`;
  const oe = '<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/vllm/vllm-advanced.png" alt="vllm-advanced"/>';
  const re = `vLLM has wide support for large language models and embedding models. We recommend reading the
<a href="https://docs.vllm.ai/en/stable/models/supported_models.html?h=supported+models" rel="nofollow">supported models</a> section in the vLLM documentation for a full list.`;
  // NOTE(review): the second sentence below reads as garbled ("support is
  // planned for most decoder language models are supported"). It is left
  // byte-for-byte because the paragraph is guarded by a `data-svelte-h`
  // marker comparison in `l()`; presumably that marker is derived from this
  // text, so the wording should be fixed in the markdown source and the page
  // rebuilt - confirm.
  const me = `vLLM also supports model implementations that are available in Transformers. Currently not all models work but support is planned for most
decoder language models are supported, and vision language models.`;
  const ue = 'We also recommend reading the <a href="https://docs.vllm.ai/en/stable/" rel="nofollow">vLLM documentation</a> for more in-depth information.';

  u = new Le({
    props: {
      containerStyle: "float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;",
    },
  });
  p = new Z({ props: { title: "vLLM", local: "vllm", headingTag: "h1" } });
  g = new Z({ props: { title: "Configuration", local: "configuration", headingTag: "h2" } });
  L = new Z({ props: { title: "Supported models", local: "supported-models", headingTag: "h2" } });
  x = new Z({ props: { title: "References", local: "references", headingTag: "h2" } });
  T = new Me({
    props: {
      source: "https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/engines/vllm.md",
    },
  });

  return {
    // Create: build every node from scratch, then apply attributes via h().
    c() {
      m = s("meta");
      E = i();
      q = s("p");
      U = i();
      w(u.$$.fragment);
      z = i();
      w(p.$$.fragment);
      G = i();
      f = s("p");
      f.textContent = te;
      I = i();
      d = s("p");
      d.innerHTML = ne;
      N = i();
      c = s("ul");
      c.innerHTML = le;
      D = i();
      w(g.$$.fragment);
      F = i();
      h = s("p");
      h.innerHTML = ie;
      R = i();
      v = s("ul");
      v.innerHTML = ae;
      V = i();
      $ = s("p");
      $.innerHTML = se;
      j = i();
      b = s("p");
      b.innerHTML = oe;
      K = i();
      w(L.$$.fragment);
      O = i();
      M = s("p");
      M.innerHTML = re;
      W = i();
      _ = s("p");
      _.textContent = me;
      B = i();
      w(x.$$.fragment);
      J = i();
      y = s("p");
      y.innerHTML = ue;
      Q = i();
      w(T.$$.fragment);
      X = i();
      A = s("p");
      this.h();
    },

    // Claim: the counterpart of c() that takes nodes from the existing list
    // `e` instead of creating them. Static content is only (re)written when
    // the value returned by r(node) differs from the expected marker string.
    l(e) {
      const t = $e("svelte-u9bgzb", document.head);
      m = o(t, "META", { name: true, content: true });
      t.forEach(n);
      E = a(e);
      q = o(e, "P", {});
      pe(q).forEach(n);
      U = a(e);
      C(u.$$.fragment, e);
      z = a(e);
      C(p.$$.fragment, e);
      G = a(e);
      f = o(e, "P", { "data-svelte-h": true });
      if (r(f) !== "svelte-3mges0") {
        f.textContent = te;
      }
      I = a(e);
      d = o(e, "P", { "data-svelte-h": true });
      if (r(d) !== "svelte-wxr29x") {
        d.innerHTML = ne;
      }
      N = a(e);
      c = o(e, "UL", { "data-svelte-h": true });
      if (r(c) !== "svelte-j2rm7r") {
        c.innerHTML = le;
      }
      D = a(e);
      C(g.$$.fragment, e);
      F = a(e);
      h = o(e, "P", { "data-svelte-h": true });
      if (r(h) !== "svelte-mut1up") {
        h.innerHTML = ie;
      }
      R = a(e);
      v = o(e, "UL", { "data-svelte-h": true });
      if (r(v) !== "svelte-q07o07") {
        v.innerHTML = ae;
      }
      V = a(e);
      $ = o(e, "P", { "data-svelte-h": true });
      if (r($) !== "svelte-ylkbud") {
        $.innerHTML = se;
      }
      j = a(e);
      b = o(e, "P", { "data-svelte-h": true });
      if (r(b) !== "svelte-lhbgtj") {
        b.innerHTML = oe;
      }
      K = a(e);
      C(L.$$.fragment, e);
      O = a(e);
      M = o(e, "P", { "data-svelte-h": true });
      if (r(M) !== "svelte-1ebg7ne") {
        M.innerHTML = re;
      }
      W = a(e);
      _ = o(e, "P", { "data-svelte-h": true });
      if (r(_) !== "svelte-3apbod") {
        _.textContent = me;
      }
      B = a(e);
      C(x.$$.fragment, e);
      J = a(e);
      y = o(e, "P", { "data-svelte-h": true });
      if (r(y) !== "svelte-1lr4w3") {
        y.innerHTML = ue;
      }
      Q = a(e);
      C(T.$$.fragment, e);
      X = a(e);
      A = o(e, "P", {});
      pe(A).forEach(n);
      this.h();
    },

    // Attributes: tag the <meta> node with the page's table-of-contents JSON.
    h() {
      fe(m, "name", "hf:doc:metadata");
      fe(m, "content", xe);
    },

    // Mount: <meta> goes into document.head; everything else is placed into
    // target `e` relative to anchor `t`, in document order.
    m(e, t) {
      be(document.head, m);
      l(e, E, t);
      l(e, q, t);
      l(e, U, t);
      P(u, e, t);
      l(e, z, t);
      P(p, e, t);
      l(e, G, t);
      l(e, f, t);
      l(e, I, t);
      l(e, d, t);
      l(e, N, t);
      l(e, c, t);
      l(e, D, t);
      P(g, e, t);
      l(e, F, t);
      l(e, h, t);
      l(e, R, t);
      l(e, v, t);
      l(e, V, t);
      l(e, $, t);
      l(e, j, t);
      l(e, b, t);
      l(e, K, t);
      P(L, e, t);
      l(e, O, t);
      l(e, M, t);
      l(e, W, t);
      l(e, _, t);
      l(e, B, t);
      P(x, e, t);
      l(e, J, t);
      l(e, y, t);
      l(e, Q, t);
      P(T, e, t);
      l(e, X, t);
      l(e, A, t);
      Y = true;
    },

    // Update hook: the imported `ce` is used as-is (the page has no reactive
    // state of its own - `ye` returns an empty array).
    p: ce,

    // Intro: forwarded to each child component once, guarded by Y.
    i(e) {
      if (Y) {
        return;
      }
      for (const child of [u, p, g, L, x, T]) {
        H(child.$$.fragment, e);
      }
      Y = true;
    },

    // Outro: forwarded to each child component; clears the Y guard.
    o(e) {
      for (const child of [u, p, g, L, x, T]) {
        k(child.$$.fragment, e);
      }
      Y = false;
    },

    // Destroy: when `e` is truthy the body nodes are removed; the head <meta>
    // and the child components are always torn down.
    d(e) {
      if (e) {
        const bodyNodes = [
          E, q, U, z, G, f, I, d, N, c, D, F, h, R, v,
          V, $, j, b, K, O, M, W, _, B, J, y, Q, X, A,
        ];
        for (const node of bodyNodes) {
          n(node);
        }
      }
      n(m);
      for (const child of [u, p, g, L, x, T]) {
        S(child, e);
      }
    },
  };
}

// JSON string written to the "hf:doc:metadata" <meta> tag in h(): the page
// title plus its three depth-2 sections.
const xe = '{"title":"vLLM","local":"vllm","sections":[{"title":"Configuration","local":"configuration","sections":[],"depth":2},{"title":"Supported models","local":"supported-models","sections":[],"depth":2},{"title":"References","local":"references","sections":[],"depth":2}],"depth":1}';

/**
 * Instance function handed to the component initializer.
 *
 * Registers a callback through `ge` that reads the `fw` query parameter from
 * the current URL; the value is read and discarded, exactly as in the
 * compiled original.
 *
 * @param {*} ee - Passed in by the caller; not read here.
 * @returns {Array} Always an empty array.
 */
function ye(ee) {
  ge(() => {
    new URLSearchParams(window.location.search).get("fw");
  });
  return [];
}

/** Page component: wires `ye` (instance) and `_e` (fragment) into the base class. */
class Pe extends he {
  constructor(m) {
    super();
    ve(this, m, ye, _e, de, {});
  }
}

export { Pe as component };
Xet Storage Details
- Size:
- 7.14 kB
- Xet hash:
- 4c742c7950a58a7175b977c8dfe99c69d92fcc7c3b46df549dc555f8b4661f4c
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.