Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev/inference-endpoints/pr_120/en/_app/immutable/nodes/5.65fbcd34.js

rtrm's picture

about 2 months ago

14.2 kB

	/**
	 * Compiled Svelte page node for the "llama.cpp" engine documentation page.
	 *
	 * This is build output, not hand-written source. The `source` prop passed to
	 * the `$t` component below points at docs/source/engines/llama_cpp.mdx, which
	 * is presumably the file this module was generated from; wording changes
	 * should be made there and the site rebuilt. The `svelte-…` hashes compared
	 * in `l()` appear to be tied to the static text, so the string constants
	 * below are intentionally left exactly as generated.
	 *
	 * Repairs made relative to the captured text (all extraction damage):
	 *  - `var` / `const` keywords and two quoted string literals that had been
	 *    split by inserted line breaks are rejoined;
	 *  - `\|\|` escapes are restored to the `||` operator semantics;
	 *  - per-line tab prefixes that had leaked into template literals are removed
	 *    (the base64 `code` prop of the code block decodes to lines without tabs).
	 *
	 * NOTE(review): the single-letter imports are minified aliases; judging by
	 * how they are used they look like the Svelte runtime DOM helpers (create
	 * element / text node, claim during hydration, set attribute, insert,
	 * detach, component lifecycle) - confirm against the chunk files.
	 */
	import { s as yt, f as qe, n as Mt, o as Lt } from "../chunks/scheduler.389d799c.js";
	import {
		S as wt, i as Tt, g as o, s as i, r as b, A as Ct, h as s, f as l, c as a,
		j as oe, u as y, x as r, k as m, y as p, a as n, v as M, d as L, t as w, w as T,
	} from "../chunks/index.8f81d18f.js";
	import { C as xt } from "../chunks/CodeBlock.c0898180.js";
	import { H as ne } from "../chunks/Heading.41733039.js";
	import { E as $t } from "../chunks/getInferenceSnippets.93d69c9a.js";

	/**
	 * Fragment factory for the page.
	 *
	 * @param {*} ze - unused by this fragment.
	 * @returns {{c: Function, l: Function, h: Function, m: Function, p: Function,
	 *   i: Function, o: Function, d: Function}} fragment object whose methods
	 *   create (`c`), claim from existing DOM (`l`), set attributes (`h`),
	 *   mount (`m`), update (`p`, a no-op), transition in/out (`i`/`o`) and
	 *   destroy (`d`) the page's nodes.
	 */
	function At(ze) {
		// Static text / HTML payloads, in document order.
		const Be = "llama.cpp is a high-performance inference engine written in C/C++, tailored for running Llama and compatible models in the GGUF format.";
		const Xe = "Core features:";
		const Ve = "<li><strong>GGUF Model Support</strong>: Native compatibility with the GGUF format and all quantization types that comes with it.</li> <li><strong>Multi-Platform</strong>: Optimized for both CPU and GPU execution, with support for AVX, AVX2, AVX512, and CUDA acceleration.</li> <li><strong>OpenAI-Compatible API</strong>: Provides endpoints for chat, completion, embedding, and more, enabling seamless integration with existing tools and workflows.</li> <li><strong>Active Community and Ecosystem</strong>: Rapid development and a rich ecosystem of tools, extensions, and integrations</li>";
		const De = `When you create an endpoint with a <a href="https://huggingface.co/docs/hub/en/gguf" rel="nofollow">GGUF</a> model,
a <a href="https://github.com/ggerganov/llama.cpp" rel="nofollow">llama.cpp</a> container is automatically selected
using the latest image built from the <code>master</code> branch of the llama.cpp repository.
Upon successful deployment, a server with an OpenAI-compatible endpoint becomes available.`;
		const Je = 'Llama.cpp supports multiple endpoints like <code>/tokenize</code>, <code>/health</code>, <code>/embedding</code> and many more. For a comprehensive list of available endpoints, please refer to the <a href="https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md#api-endpoints" rel="nofollow">API documentation</a>.';
		const Qe = "To deploy an endpoint with a llama.cpp container, follow these steps:";
		const Ke = '<li><a href="./create_endpoint">Create a new endpoint</a> and select a repository containing a GGUF model. The llama.cpp container will be automatically selected.</li>';
		const tt = "<li>Choose the desired GGUF file, noting that memory requirements will vary depending on the selected file. For example, an F16 model requires more memory than a Q4_K_M model.</li>";
		const nt = "<li>Select your desired hardware configuration.</li>";
		const at = "<li><p>Optionally, you can customize the container’s configuration settings like <code>Max Tokens</code>, <code>Number of Concurrent Requests</code>. For more information on those, please refer to the <strong>Configurations</strong> section below.</p></li> <li><p>Click the <strong>Create Endpoint</strong> button to complete the deployment.</p></li>";
		const ot = "Alternatively, you can follow the video tutorial below for a step-by-step guide on deploying an endpoint with a llama.cpp container:";
		const rt = "The llama.cpp container offers several configuration options that can be adjusted. After deployment, you can modify these settings by accessing the <strong>Settings</strong> tab on the endpoint details page.";
		const mt = `<li><strong>Max Tokens (per Request)</strong>: The maximum number of tokens that can be sent in a single request.</li> <li><strong>Max Concurrent Requests</strong>: The maximum number of concurrent requests allowed for this deployment. Increasing this limit requires additional memory allocation.
For instance, setting this value to 4 requests with 1024 tokens maximum per request requires memory capacity for 4096 tokens in total.</li>`;
		const pt = `In addition to the basic configurations, you can also modify specific settings by setting environment variables.
A list of available environment variables can be found in the <a href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#usage" rel="nofollow">API documentation</a>.`;
		const ct = "Please note that the following environment variables are reserved by the system and cannot be modified:";
		const ut = "<li><code>LLAMA_ARG_MODEL</code></li> <li><code>LLAMA_ARG_HTTP_THREADS</code></li> <li><code>LLAMA_ARG_N_GPU_LAYERS</code></li> <li><code>LLAMA_ARG_EMBEDDINGS</code></li> <li><code>LLAMA_ARG_HOST</code></li> <li><code>LLAMA_ARG_PORT</code></li> <li><code>LLAMA_ARG_NO_MMAP</code></li> <li><code>LLAMA_ARG_CTX_SIZE</code></li> <li><code>LLAMA_ARG_N_PARALLEL</code></li> <li><code>LLAMA_ARG_ENDPOINT_METRICS</code></li>";
		const ft = "In case the deployment fails, please watch the log output for any error messages.";
		const dt = 'You can access the logs by clicking on the <strong>Logs</strong> tab on the endpoint details page. To learn more, refer to the <a href="./logs">Logs</a> documentation.';
		const gt = `<strong>Malloc failed: out of memory</strong><br/>
If you see this error message in the log:`;
		const ht = "That means the selected hardware configuration does not have enough memory to accommodate the selected GGUF model. You can try to:";
		const vt = "<li>Lower the number of maximum tokens per request</li> <li>Lower the number of concurrent requests</li> <li>Select a smaller GGUF model</li> <li>Select a larger hardware configuration</li>";
		const _t = `<p><strong>Workload evicted, storage limit exceeded</strong><br/>
This error message indicates that the hardware has too little memory to accommodate the selected GGUF model. Try selecting a smaller model or select a larger hardware configuration.</p>`;
		const bt = `<p><strong>Other problems</strong><br/>
For other problems, please refer to the <a href="https://github.com/ggerganov/llama.cpp/issues" rel="nofollow">llama.cpp issues page</a>. In case you want to create a new issue, please also include the full log output in your bug report.</p>`;

		// DOM nodes, separators and child components, assigned in c() or l().
		let d, se, ie, re, C, me, x, pe, $, ce, A, ue, G, fe, H, de, U, ge, P, he, k, ve;
		let I, _e, g, be, R, ye, h, Me, E, Le, v, we, S, Te, u, Ce, F, xe, j, $e, q, Ae;
		let Y, Ge, Z, He, N, Ue, W, Pe, O, ke, z, Ie, B, Re, X, Ee, f, c, J, Ye, V, Ze;
		let Q, Ne, K, We, ee, Oe, te, Se, D, Fe, ae;
		// Media URLs, assigned inside h().
		let et, lt, it, st;
		// Set once the child components have been transitioned in; cleared by o().
		let je;

		// Child components: six headings, one code block, one trailing `$t` component.
		C = new ne({ props: { title: "llama.cpp", local: "llamacpp", headingTag: "h1" } });
		U = new ne({ props: { title: "Deployment Steps", local: "deployment-steps", headingTag: "h2" } });
		F = new ne({ props: { title: "Configurations", local: "configurations", headingTag: "h2" } });
		q = new ne({ props: { title: "Basic Configurations", local: "basic-configurations", headingTag: "h3" } });
		Z = new ne({ props: { title: "Advanced Configurations", local: "advanced-configurations", headingTag: "h3" } });
		z = new ne({ props: { title: "Troubleshooting", local: "troubleshooting", headingTag: "h2" } });
		V = new xt({
			props: {
				// base64 of the URI-encoded log excerpt shown in the code block.
				code: "Z2dtbF9iYWNrZW5kX2N1ZGFfYnVmZmVyX3R5cGVfYWxsb2NfYnVmZmVyJTNBJTIwYWxsb2NhdGluZyUyMDY3MjAwLjAwJTIwTWlCJTIwb24lMjBkZXZpY2UlMjAwJTNBJTIwY3VkYSUwQU1hbGxvYyUyMGZhaWxlZCUzQSUyMG91dCUyMG9mJTIwbWVtb3J5JTBBbGxhbWFfa3ZfY2FjaGVfaW5pdCUzQSUyMGZhaWxlZCUyMHRvJTIwYWxsb2NhdGUlMjBidWZmZXIlMjBmb3IlMjBrdiUyMGNhY2hlJTBBbGxhbWFfbmV3X2NvbnRleHRfd2l0aF9tb2RlbCUzQSUyMGxsYW1hX2t2X2NhY2hlX2luaXQoKSUyMGZhaWxlZCUyMGZvciUyMHNlbGYtYXR0ZW50aW9uJTIwY2FjaGUlMEEuLi4=",
				highlighted: `ggml_backend_cuda_buffer_type_alloc_buffer: allocating <span class="hljs-number">67200.00</span> MiB <span class="hljs-keyword">on</span> device <span class="hljs-number">0</span>: cuda
Malloc failed: out of memory
llama_kv_cache_init: failed <span class="hljs-keyword">to</span> allocate buffer for kv <span class="hljs-keyword">cache</span>
llama_new_context_with_model: llama_kv_cache_init() failed for <span class="hljs-built_in">self</span><span class="hljs-params">-attention</span> <span class="hljs-keyword">cache</span>
<span class="hljs-params">...</span>`,
				wrap: false,
			},
		});
		D = new $t({ props: { source: "https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/engines/llama_cpp.mdx" } });

		return {
			// Create every node from scratch, then apply attributes via h().
			c() {
				d = o("meta"); se = i();
				ie = o("p"); re = i();
				b(C.$$.fragment); me = i();
				x = o("p"); x.textContent = Be; pe = i();
				$ = o("p"); $.textContent = Xe; ce = i();
				A = o("ul"); A.innerHTML = Ve; ue = i();
				G = o("p"); G.innerHTML = De; fe = i();
				H = o("p"); H.innerHTML = Je; de = i();
				b(U.$$.fragment); ge = i();
				P = o("p"); P.textContent = Qe; he = i();
				k = o("ol"); k.innerHTML = Ke; ve = i();
				I = o("img"); _e = i();
				g = o("ol"); g.innerHTML = tt; be = i();
				R = o("img"); ye = i();
				h = o("ol"); h.innerHTML = nt; Me = i();
				E = o("img"); Le = i();
				v = o("ol"); v.innerHTML = at; we = i();
				S = o("p"); S.textContent = ot; Te = i();
				u = o("video"); Ce = i();
				b(F.$$.fragment); xe = i();
				j = o("p"); j.innerHTML = rt; $e = i();
				b(q.$$.fragment); Ae = i();
				Y = o("ul"); Y.innerHTML = mt; Ge = i();
				b(Z.$$.fragment); He = i();
				N = o("p"); N.innerHTML = pt; Ue = i();
				W = o("p"); W.textContent = ct; Pe = i();
				O = o("ul"); O.innerHTML = ut; ke = i();
				b(z.$$.fragment); Ie = i();
				B = o("p"); B.textContent = ft; Re = i();
				X = o("p"); X.innerHTML = dt; Ee = i();
				f = o("ul");
				c = o("li");
				J = o("p"); J.innerHTML = gt; Ye = i();
				b(V.$$.fragment); Ze = i();
				Q = o("p"); Q.textContent = ht; Ne = i();
				K = o("ul"); K.innerHTML = vt; We = i();
				ee = o("li"); ee.innerHTML = _t; Oe = i();
				te = o("li"); te.innerHTML = bt; Se = i();
				b(D.$$.fragment); Fe = i();
				ae = o("p");
				this.h();
			},

			// Claim nodes from the child list `e`; static content is only rewritten
			// when the value returned by r(node) differs from the expected hash.
			l(e) {
				const t = Ct("svelte-u9bgzb", document.head);
				d = s(t, "META", { name: true, content: true });
				t.forEach(l);
				se = a(e);
				ie = s(e, "P", {});
				oe(ie).forEach(l);
				re = a(e);
				y(C.$$.fragment, e);
				me = a(e);
				x = s(e, "P", { "data-svelte-h": true });
				if (r(x) !== "svelte-17wgbkm") x.textContent = Be;
				pe = a(e);
				$ = s(e, "P", { "data-svelte-h": true });
				if (r($) !== "svelte-plvtos") $.textContent = Xe;
				ce = a(e);
				A = s(e, "UL", { "data-svelte-h": true });
				if (r(A) !== "svelte-1o15oyk") A.innerHTML = Ve;
				ue = a(e);
				G = s(e, "P", { "data-svelte-h": true });
				if (r(G) !== "svelte-qmge18") G.innerHTML = De;
				fe = a(e);
				H = s(e, "P", { "data-svelte-h": true });
				if (r(H) !== "svelte-4ukih7") H.innerHTML = Je;
				de = a(e);
				y(U.$$.fragment, e);
				ge = a(e);
				P = s(e, "P", { "data-svelte-h": true });
				if (r(P) !== "svelte-1viuqt7") P.textContent = Qe;
				he = a(e);
				k = s(e, "OL", { "data-svelte-h": true });
				if (r(k) !== "svelte-zqz83d") k.innerHTML = Ke;
				ve = a(e);
				I = s(e, "IMG", { src: true, alt: true });
				_e = a(e);
				g = s(e, "OL", { start: true, "data-svelte-h": true });
				if (r(g) !== "svelte-a3ltnp") g.innerHTML = tt;
				be = a(e);
				R = s(e, "IMG", { src: true, alt: true });
				ye = a(e);
				h = s(e, "OL", { start: true, "data-svelte-h": true });
				if (r(h) !== "svelte-1mdwz6y") h.innerHTML = nt;
				Me = a(e);
				E = s(e, "IMG", { src: true, alt: true });
				Le = a(e);
				v = s(e, "OL", { start: true, "data-svelte-h": true });
				if (r(v) !== "svelte-1vuz66q") v.innerHTML = at;
				we = a(e);
				S = s(e, "P", { "data-svelte-h": true });
				if (r(S) !== "svelte-174w0va") S.textContent = ot;
				Te = a(e);
				u = s(e, "VIDEO", { width: true, height: true, src: true });
				oe(u).forEach(l);
				Ce = a(e);
				y(F.$$.fragment, e);
				xe = a(e);
				j = s(e, "P", { "data-svelte-h": true });
				if (r(j) !== "svelte-jdgis4") j.innerHTML = rt;
				$e = a(e);
				y(q.$$.fragment, e);
				Ae = a(e);
				Y = s(e, "UL", { "data-svelte-h": true });
				if (r(Y) !== "svelte-d1iatn") Y.innerHTML = mt;
				Ge = a(e);
				y(Z.$$.fragment, e);
				He = a(e);
				N = s(e, "P", { "data-svelte-h": true });
				if (r(N) !== "svelte-1sl7l1d") N.innerHTML = pt;
				Ue = a(e);
				W = s(e, "P", { "data-svelte-h": true });
				if (r(W) !== "svelte-omzti") W.textContent = ct;
				Pe = a(e);
				O = s(e, "UL", { "data-svelte-h": true });
				if (r(O) !== "svelte-apzx5r") O.innerHTML = ut;
				ke = a(e);
				y(z.$$.fragment, e);
				Ie = a(e);
				B = s(e, "P", { "data-svelte-h": true });
				if (r(B) !== "svelte-sywduw") B.textContent = ft;
				Re = a(e);
				X = s(e, "P", { "data-svelte-h": true });
				if (r(X) !== "svelte-dnupoj") X.innerHTML = dt;
				Ee = a(e);

				// Troubleshooting list: the first <li> is claimed child by child because
				// it hosts the code-block component; the other two are static HTML.
				f = s(e, "UL", {});
				const le = oe(f);
				c = s(le, "LI", {});
				const _ = oe(c);
				J = s(_, "P", { "data-svelte-h": true });
				if (r(J) !== "svelte-14vukni") J.innerHTML = gt;
				Ye = a(_);
				y(V.$$.fragment, _);
				Ze = a(_);
				Q = s(_, "P", { "data-svelte-h": true });
				if (r(Q) !== "svelte-ygfqn1") Q.textContent = ht;
				Ne = a(_);
				K = s(_, "UL", { "data-svelte-h": true });
				if (r(K) !== "svelte-7cq89u") K.innerHTML = vt;
				_.forEach(l);
				We = a(le);
				ee = s(le, "LI", { "data-svelte-h": true });
				if (r(ee) !== "svelte-9ofzm8") ee.innerHTML = _t;
				Oe = a(le);
				te = s(le, "LI", { "data-svelte-h": true });
				if (r(te) !== "svelte-xbgvsk") te.innerHTML = bt;
				le.forEach(l);
				Se = a(e);
				y(D.$$.fragment, e);
				Fe = a(e);
				ae = s(e, "P", {});
				oe(ae).forEach(l);
				this.h();
			},

			// Apply attributes. For each media element, `src` is only set through m()
			// when qe() returns a falsy value for the current src and the target URL.
			h() {
				m(d, "name", "hf:doc:metadata");
				m(d, "content", Gt);
				if (!qe(I.src, (et = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/endpoints/llamacpp_1.png"))) m(I, "src", et);
				m(I, "alt", "Select model");
				m(g, "start", "2");
				if (!qe(R.src, (lt = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/endpoints/llamacpp_2.png"))) m(R, "src", lt);
				m(R, "alt", "Select GGUF file");
				m(h, "start", "3");
				if (!qe(E.src, (it = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/endpoints/llamacpp_3.png"))) m(E, "src", it);
				m(E, "alt", "Select hardware");
				m(v, "start", "4");
				m(u, "width", "1280");
				m(u, "height", "720");
				if (!qe(u.src, (st = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/endpoints/llamacpp_guide_video.mp4"))) m(u, "src", st);
				u.controls = "true";
			},

			// Mount: the meta tag goes into document.head, everything else into
			// target `e` before anchor `t`, in document order.
			m(e, t) {
				p(document.head, d);
				n(e, se, t); n(e, ie, t); n(e, re, t);
				M(C, e, t);
				n(e, me, t); n(e, x, t); n(e, pe, t); n(e, $, t); n(e, ce, t); n(e, A, t);
				n(e, ue, t); n(e, G, t); n(e, fe, t); n(e, H, t); n(e, de, t);
				M(U, e, t);
				n(e, ge, t); n(e, P, t); n(e, he, t); n(e, k, t); n(e, ve, t); n(e, I, t);
				n(e, _e, t); n(e, g, t); n(e, be, t); n(e, R, t); n(e, ye, t); n(e, h, t);
				n(e, Me, t); n(e, E, t); n(e, Le, t); n(e, v, t); n(e, we, t); n(e, S, t);
				n(e, Te, t); n(e, u, t); n(e, Ce, t);
				M(F, e, t);
				n(e, xe, t); n(e, j, t); n(e, $e, t);
				M(q, e, t);
				n(e, Ae, t); n(e, Y, t); n(e, Ge, t);
				M(Z, e, t);
				n(e, He, t); n(e, N, t); n(e, Ue, t); n(e, W, t); n(e, Pe, t); n(e, O, t); n(e, ke, t);
				M(z, e, t);
				n(e, Ie, t); n(e, B, t); n(e, Re, t); n(e, X, t); n(e, Ee, t); n(e, f, t);
				p(f, c);
				p(c, J); p(c, Ye);
				M(V, c, null);
				p(c, Ze); p(c, Q); p(c, Ne); p(c, K);
				p(f, We); p(f, ee); p(f, Oe); p(f, te);
				n(e, Se, t);
				M(D, e, t);
				n(e, Fe, t); n(e, ae, t);
				je = true;
			},

			// Update is the imported no-op `Mt`: the page has no reactive state.
			p: Mt,

			// Transition in the child components once (guarded by `je`).
			i(e) {
				if (je) return;
				L(C.$$.fragment, e); L(U.$$.fragment, e); L(F.$$.fragment, e); L(q.$$.fragment, e);
				L(Z.$$.fragment, e); L(z.$$.fragment, e); L(V.$$.fragment, e); L(D.$$.fragment, e);
				je = true;
			},

			// Transition out the child components and re-arm i().
			o(e) {
				w(C.$$.fragment, e); w(U.$$.fragment, e); w(F.$$.fragment, e); w(q.$$.fragment, e);
				w(Z.$$.fragment, e); w(z.$$.fragment, e); w(V.$$.fragment, e); w(D.$$.fragment, e);
				je = false;
			},

			// Destroy: top-level nodes are removed only when `e` is truthy; the meta
			// tag and the child components are always torn down.
			d(e) {
				if (e) {
					l(se); l(ie); l(re); l(me); l(x); l(pe); l($); l(ce); l(A); l(ue);
					l(G); l(fe); l(H); l(de); l(ge); l(P); l(he); l(k); l(ve); l(I);
					l(_e); l(g); l(be); l(R); l(ye); l(h); l(Me); l(E); l(Le); l(v);
					l(we); l(S); l(Te); l(u); l(Ce); l(xe); l(j); l($e); l(Ae); l(Y);
					l(Ge); l(He); l(N); l(Ue); l(W); l(Pe); l(O); l(ke); l(Ie); l(B);
					l(Re); l(X); l(Ee); l(f); l(Se); l(Fe); l(ae);
				}
				l(d);
				T(C, e); T(U, e); T(F, e); T(q, e); T(Z, e); T(z, e);
				T(V);
				T(D, e);
			},
		};
	}

	// JSON table of contents written into the <meta name="hf:doc:metadata"> tag by h().
	const Gt = '{"title":"llama.cpp","local":"llamacpp","sections":[{"title":"Deployment Steps","local":"deployment-steps","sections":[],"depth":2},{"title":"Configurations","local":"configurations","sections":[{"title":"Basic Configurations","local":"basic-configurations","sections":[],"depth":3},{"title":"Advanced Configurations","local":"advanced-configurations","sections":[],"depth":3}],"depth":2},{"title":"Troubleshooting","local":"troubleshooting","sections":[],"depth":2}],"depth":1}';

	/**
	 * Instance initialiser passed to `Tt`.
	 *
	 * Registers a callback through `Lt` that reads the `fw` query parameter and
	 * discards the value, then returns an empty array.
	 *
	 * @param {*} ze - unused.
	 * @returns {Array} always an empty array.
	 */
	function Ht(ze) {
		Lt(() => {
			new URLSearchParams(window.location.search).get("fw");
		});
		return [];
	}

	/** Page component: wires `Ht` and `At` into the `wt` base class via `Tt`. */
	class Et extends wt {
		constructor(d) {
			super();
			Tt(this, d, Ht, At, yt, {});
		}
	}

	export { Et as component };

Xet Storage Details

Size: 14.2 kB
Xet hash: 4dd85ace2a54a93a533f4758ecb665dfa29a55b54aad44084c9069718df37002

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.