Buckets:

rtrm's picture
download
raw
6.98 kB
/**
 * Client bundle for the documentation page "Use VLLM as backend".
 *
 * NOTE(review): this looks like minified Svelte compiler output (helpers are
 * imported from ../chunks/scheduler and ../chunks/index, and elements carry
 * "data-svelte-h" markers). It is presumably generated from the .mdx file
 * named in the EditOnGithub props below - confirm, and prefer editing that
 * source over hand-editing this file.
 *
 * Definitions in this module:
 *
 *  - ue(R): fragment for a single <p> whose innerHTML is the out-of-memory
 *    hint. It is handed to the Tip component (`me`) as its default slot,
 *    with the prop warning:!0.
 *
 *  - ce(R): fragment for the whole page. It builds, in document order:
 *    a <meta name="hf:doc:metadata"> whose content is `fe` and which is
 *    appended to document.head; the page heading (`pe`, title
 *    "Use VLLM as backend", headingTag "h1"); an intro paragraph; three
 *    CodeBlock instances (`A`: f, M, T) separated by explanatory paragraphs;
 *    the <ul> listing the VLLMModelConfig arguments; the Tip (p); and the
 *    EditOnGithub link (`de`).
 *    Each CodeBlock receives `code` (an encoded copy of the command -
 *    presumably base64 over a URI-encoded string, TODO confirm) and
 *    `highlighted` (pre-rendered highlight.js HTML for the same command).
 *    The returned object's keys c / l / h / m / p / i / o / d presumably
 *    follow Svelte's fragment convention (create, claim, hydrate attributes,
 *    mount, update, intro, outro, destroy) - inferred from how the imported
 *    helpers are used here, not from their definitions.
 *    In l(), innerHTML is only re-assigned when U(node) differs from the
 *    hard-coded "svelte-..." id, so the HTML strings and those ids must stay
 *    in step. In p(), only the Tip is updated, and only when bit 2 of the
 *    dirty mask `t` is set.
 *
 *  - fe: JSON string with the page metadata (title, local anchor, sections,
 *    depth) used as the content of the meta tag above.
 *
 *  - ge(R): instance function. It registers a callback through `se` that
 *    reads the "fw" query parameter from window.location.search and discards
 *    the value, then returns an empty array.
 *
 *  - be: component class extending `ne`; its constructor wires everything
 *    together through oe(this, a, ge, ce, le, {}). Exported as `component`.
 *
 * NOTE(review): the paragraph string `S` contains the typo "in the the".
 * It is deliberately left untouched here because it is runtime text paired
 * with a hydration id; fix it in the documentation source and rebuild.
 */
import{s as le,o as se,n as ae}from"../chunks/scheduler.7da89386.js";import{S as ne,i as oe,g as r,s as n,r as v,A as ie,h as m,f as l,c as o,j as ee,u as J,x as U,k as te,y as re,a as s,v as H,d as L,t as C,w as Z}from"../chunks/index.0b7befd3.js";import{T as me}from"../chunks/Tip.1e71740f.js";import{C as A}from"../chunks/CodeBlock.ce33a881.js";import{H as pe,E as de}from"../chunks/EditOnGithub.0cb2bc8e.js";function ue(R){let a,d=`In the case of OOM issues, you might need to reduce the context size of the
model as well as reduce the <code>gpu_memory_utilisation</code> parameter.`;return{c(){a=r("p"),a.innerHTML=d},l(i){a=m(i,"P",{"data-svelte-h":!0}),U(a)!=="svelte-by827e"&&(a.innerHTML=d)},m(i,w){s(i,a,w)},p:ae,d(i){i&&l(a)}}}function ce(R){let a,d,i,w,u,j,c,X=`Lighteval allows you to use <code>vllm</code> as backend allowing great speedups.
To use, simply change the <code>model_args</code> to reflect the arguments you want to pass to vllm.`,W,f,I,g,S=`<code>vllm</code> is able to distribute the model across multiple GPUs using data
parallelism, pipeline parallelism or tensor parallelism.
You can choose the parallelism method by setting in the the <code>model_args</code>.`,x,h,Y="For example if you have 4 GPUs you can split it across using <code>tensor_parallelism</code>:",F,M,G,y,B="Or, if your model fits on a single GPU, you can use <code>data_parallelism</code> to speed up the evaluation:",V,T,P,$,O="Available arguments for <code>vllm</code> can be found in the <code>VLLMModelConfig</code>:",q,b,D="<li><strong>pretrained</strong> (str): HuggingFace Hub model ID name or the path to a pre-trained model to load.</li> <li><strong>gpu_memory_utilisation</strong> (float): The fraction of GPU memory to use.</li> <li><strong>revision</strong> (str): The revision of the model.</li> <li><strong>dtype</strong> (str, None): The data type to use for the model.</li> <li><strong>tensor_parallel_size</strong> (int): The number of tensor parallel units to use.</li> <li><strong>data_parallel_size</strong> (int): The number of data parallel units to use.</li> <li><strong>max_model_length</strong> (int): The maximum length of the model.</li> <li><strong>swap_space</strong> (int): The CPU swap space size (GiB) per GPU.</li> <li><strong>seed</strong> (int): The seed to use for the model.</li> <li><strong>trust_remote_code</strong> (bool): Whether to trust remote code during model loading.</li> <li><strong>add_special_tokens</strong> (bool): Whether to add special tokens to the input sequences.</li> <li><strong>multichoice_continuations_start_space</strong> (bool): Whether to add a space at the start of each continuation in multichoice generation.</li>",E,p,z,_,Q,k,N;return u=new pe({props:{title:"Use VLLM as backend",local:"use-vllm-as-backend",headingTag:"h1"}}),f=new A({props:{code:"bGlnaHRldmFsJTIwdmxsbSUyMCU1QyUwQSUyMCUyMCUyMCUyMCUyMnByZXRyYWluZWQlM0RIdWdnaW5nRmFjZUg0JTJGemVwaHlyLTdiLWJldGElMkNkdHlwZSUzRGZsb2F0MTYlMjIlMjAlNUMlMEElMjAlMjAlMjAlMjAlMjJsZWFkZXJib2FyZCU3Q3RydXRoZnVscWElM0FtYyU3QzAlN0MwJTIy",highlighted:`lighteval vllm \\
<span class="hljs-string">&quot;pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16&quot;</span> \\
<span class="hljs-string">&quot;leaderboard|truthfulqa:mc|0|0&quot;</span>`,wrap:!1}}),M=new A({props:{code:"ZXhwb3J0JTIwVkxMTV9XT1JLRVJfTVVMVElQUk9DX01FVEhPRCUzRHNwYXduJTIwJTI2JTI2JTIwbGlnaHRldmFsJTIwdmxsbSUyMCU1QyUwQSUyMCUyMCUyMCUyMCUyMnByZXRyYWluZWQlM0RIdWdnaW5nRmFjZUg0JTJGemVwaHlyLTdiLWJldGElMkNkdHlwZSUzRGZsb2F0MTYlMkN0ZW5zb3JfcGFyYWxsZWxfc2l6ZSUzRDQlMjIlMjAlNUMlMEElMjAlMjAlMjAlMjAlMjJsZWFkZXJib2FyZCU3Q3RydXRoZnVscWElM0FtYyU3QzAlN0MwJTIy",highlighted:`<span class="hljs-built_in">export</span> VLLM_WORKER_MULTIPROC_METHOD=spawn &amp;&amp; lighteval vllm \\
<span class="hljs-string">&quot;pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tensor_parallel_size=4&quot;</span> \\
<span class="hljs-string">&quot;leaderboard|truthfulqa:mc|0|0&quot;</span>`,wrap:!1}}),T=new A({props:{code:"bGlnaHRldmFsJTIwdmxsbSUyMCU1QyUwQSUyMCUyMCUyMCUyMCUyMnByZXRyYWluZWQlM0RIdWdnaW5nRmFjZUg0JTJGemVwaHlyLTdiLWJldGElMkNkdHlwZSUzRGZsb2F0MTYlMkNkYXRhX3BhcmFsbGVsX3NpemUlM0Q0JTIyJTIwJTVDJTBBJTIwJTIwJTIwJTIwJTIybGVhZGVyYm9hcmQlN0N0cnV0aGZ1bHFhJTNBbWMlN0MwJTdDMCUyMg==",highlighted:`lighteval vllm \\
<span class="hljs-string">&quot;pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,data_parallel_size=4&quot;</span> \\
<span class="hljs-string">&quot;leaderboard|truthfulqa:mc|0|0&quot;</span>`,wrap:!1}}),p=new me({props:{warning:!0,$$slots:{default:[ue]},$$scope:{ctx:R}}}),_=new de({props:{source:"https://github.com/huggingface/lighteval/blob/main/docs/source/use-vllm-as-backend.mdx"}}),{c(){a=r("meta"),d=n(),i=r("p"),w=n(),v(u.$$.fragment),j=n(),c=r("p"),c.innerHTML=X,W=n(),v(f.$$.fragment),I=n(),g=r("p"),g.innerHTML=S,x=n(),h=r("p"),h.innerHTML=Y,F=n(),v(M.$$.fragment),G=n(),y=r("p"),y.innerHTML=B,V=n(),v(T.$$.fragment),P=n(),$=r("p"),$.innerHTML=O,q=n(),b=r("ul"),b.innerHTML=D,E=n(),v(p.$$.fragment),z=n(),v(_.$$.fragment),Q=n(),k=r("p"),this.h()},l(e){const t=ie("svelte-u9bgzb",document.head);a=m(t,"META",{name:!0,content:!0}),t.forEach(l),d=o(e),i=m(e,"P",{}),ee(i).forEach(l),w=o(e),J(u.$$.fragment,e),j=o(e),c=m(e,"P",{"data-svelte-h":!0}),U(c)!=="svelte-1ki3g9h"&&(c.innerHTML=X),W=o(e),J(f.$$.fragment,e),I=o(e),g=m(e,"P",{"data-svelte-h":!0}),U(g)!=="svelte-1sv8wkg"&&(g.innerHTML=S),x=o(e),h=m(e,"P",{"data-svelte-h":!0}),U(h)!=="svelte-44w0qa"&&(h.innerHTML=Y),F=o(e),J(M.$$.fragment,e),G=o(e),y=m(e,"P",{"data-svelte-h":!0}),U(y)!=="svelte-190jhqw"&&(y.innerHTML=B),V=o(e),J(T.$$.fragment,e),P=o(e),$=m(e,"P",{"data-svelte-h":!0}),U($)!=="svelte-c4hdfo"&&($.innerHTML=O),q=o(e),b=m(e,"UL",{"data-svelte-h":!0}),U(b)!=="svelte-xji1iw"&&(b.innerHTML=D),E=o(e),J(p.$$.fragment,e),z=o(e),J(_.$$.fragment,e),Q=o(e),k=m(e,"P",{}),ee(k).forEach(l),this.h()},h(){te(a,"name","hf:doc:metadata"),te(a,"content",fe)},m(e,t){re(document.head,a),s(e,d,t),s(e,i,t),s(e,w,t),H(u,e,t),s(e,j,t),s(e,c,t),s(e,W,t),H(f,e,t),s(e,I,t),s(e,g,t),s(e,x,t),s(e,h,t),s(e,F,t),H(M,e,t),s(e,G,t),s(e,y,t),s(e,V,t),H(T,e,t),s(e,P,t),s(e,$,t),s(e,q,t),s(e,b,t),s(e,E,t),H(p,e,t),s(e,z,t),H(_,e,t),s(e,Q,t),s(e,k,t),N=!0},p(e,[t]){const 
K={};t&2&&(K.$$scope={dirty:t,ctx:e}),p.$set(K)},i(e){N||(L(u.$$.fragment,e),L(f.$$.fragment,e),L(M.$$.fragment,e),L(T.$$.fragment,e),L(p.$$.fragment,e),L(_.$$.fragment,e),N=!0)},o(e){C(u.$$.fragment,e),C(f.$$.fragment,e),C(M.$$.fragment,e),C(T.$$.fragment,e),C(p.$$.fragment,e),C(_.$$.fragment,e),N=!1},d(e){e&&(l(d),l(i),l(w),l(j),l(c),l(W),l(I),l(g),l(x),l(h),l(F),l(G),l(y),l(V),l(P),l($),l(q),l(b),l(E),l(z),l(Q),l(k)),l(a),Z(u,e),Z(f,e),Z(M,e),Z(T,e),Z(p,e),Z(_,e)}}}const fe='{"title":"Use VLLM as backend","local":"use-vllm-as-backend","sections":[],"depth":1}';function ge(R){return se(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class be extends ne{constructor(a){super(),oe(this,a,ge,ce,le,{})}}export{be as component};

Xet Storage Details

Size:
6.98 kB
·
Xet hash:
d8ea3f01ef4eeabefffe3ee274631b608ce294378746d6b9b445f054438ae3b5

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.