Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev/inference-endpoints/pr_153/en/_app/immutable/nodes/5.b4e53dab.js

rtrm's picture

about 2 months ago

14.5 kB

	/**
	 * Compiled page module for the "llama.cpp" engine documentation page.
	 *
	 * This is build output (minified identifiers are kept so the module stays
	 * recognisable next to its sibling chunks). It exports one component class,
	 * `jt`, under the name `component`.
	 *
	 * NOTE(review): the helpers imported from `../chunks/*` are opaque from this
	 * file. The shape of the code (c/l/h/m/p/i/o/d methods, `data-svelte-h`
	 * attributes, `$$.fragment`) looks like Svelte compiler output; confirm the
	 * exact helper semantics against the chunk sources before relying on the
	 * hedged descriptions below.
	 */
	import { s as Lt, a as Ze, n as wt, o as $t } from "../chunks/scheduler.eb244325.js";
	import {
		S as Tt,
		i as xt,
		e as o,
		s as i,
		c as g,
		h as Ct,
		a as s,
		d as l,
		b as a,
		f as se,
		g as h,
		j as r,
		k as m,
		l as p,
		m as n,
		n as v,
		t as _,
		o as b,
		p as y,
	} from "../chunks/index.661680a1.js";
	import { C as At, H as ie, E as Gt } from "../chunks/MermaidChart.svelte_svelte_type_style_lang.5f6741e0.js";
	import { C as Ht } from "../chunks/CodeBlock.bffec0b4.js";

	/**
	 * Builds the fragment object for the page.
	 *
	 * @param {*} Xe - Unused by this function.
	 * @returns {object} An object exposing the methods `c`, `l`, `h`, `m`, `p`,
	 *   `i`, `o` and `d` that create, claim, configure, insert, animate and
	 *   remove the page's nodes and child components.
	 */
	function Ut(Xe) {
		// Static text / HTML payloads assigned to the nodes created below.
		// The "svelte-…" strings compared in l() are tied to this content, so the
		// text must not be edited here without regenerating the page.
		const Ve = "llama.cpp is a high-performance inference engine written in C/C++, tailored for running Llama and compatible models in the GGUF format.";
		const De = "Core features:";
		const Je = "<li><strong>GGUF Model Support</strong>: Native compatibility with the GGUF format and all quantization types that comes with it.</li> <li><strong>Multi-Platform</strong>: Optimized for both CPU and GPU execution, with support for AVX, AVX2, AVX512, and CUDA acceleration.</li> <li><strong>OpenAI-Compatible API</strong>: Provides endpoints for chat, completion, embedding, and more, enabling seamless integration with existing tools and workflows.</li> <li><strong>Active Community and Ecosystem</strong>: Rapid development and a rich ecosystem of tools, extensions, and integrations</li>";
		const Qe = `When you create an endpoint with a <a href="https://huggingface.co/docs/hub/en/gguf" rel="nofollow">GGUF</a> model,\na <a href="https://github.com/ggerganov/llama.cpp" rel="nofollow">llama.cpp</a> container is automatically selected\nusing the latest image built from the <code>master</code> branch of the llama.cpp repository.\nUpon successful deployment, a server with an OpenAI-compatible endpoint becomes available.`;
		const Ke = 'llama.cpp supports multiple endpoints like <code>/tokenize</code>, <code>/health</code>, <code>/embedding</code>, and many more. For a comprehensive list of available endpoints, please refer to the <a href="https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md#api-endpoints" rel="nofollow">API documentation</a>.';
		const et = "To deploy an endpoint with a llama.cpp container, follow these steps:";
		const tt = '<li><a href="./create_endpoint">Create a new endpoint</a> and select a repository containing a GGUF model. The llama.cpp container will be automatically selected.</li>';
		const nt = "<li>Choose the desired GGUF file, noting that memory requirements will vary depending on the selected file. For example, an F16 model requires more memory than a Q4_K_M model.</li>";
		const at = "<li>Select your desired hardware configuration.</li>";
		const st = "<li><p>Optionally, you can customize the container’s configuration settings like <code>Max Tokens</code>, <code>Number of Concurrent Requests</code>. For more information on those, please refer to the <strong>Configurations</strong> section below.</p></li> <li><p>Click the <strong>Create Endpoint</strong> button to complete the deployment.</p></li>";
		const rt = "Alternatively, you can follow the video tutorial below for a step-by-step guide on deploying an endpoint with a llama.cpp container:";
		const pt = "The llama.cpp container offers several configuration options that can be adjusted. After deployment, you can modify these settings by accessing the <strong>Settings</strong> tab on the endpoint details page.";
		const ct = `<li><strong>Max Tokens (per Request)</strong>: The maximum number of tokens that can be sent in a single request.</li> <li><strong>Max Concurrent Requests</strong>: The maximum number of concurrent requests allowed for this deployment. Increasing this limit requires additional memory allocation.\nFor instance, setting this value to 4 requests with 1024 tokens maximum per request requires memory capacity for 4096 tokens in total.</li>`;
		const ut = `In addition to the basic configurations, you can also modify specific settings by setting environment variables.\nA list of available environment variables can be found in the <a href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#usage" rel="nofollow">API documentation</a>.`;
		const ft = "Please note that the following environment variables are reserved by the system and cannot be modified:";
		const dt = "<li><code>LLAMA_ARG_MODEL</code></li> <li><code>LLAMA_ARG_HTTP_THREADS</code></li> <li><code>LLAMA_ARG_N_GPU_LAYERS</code></li> <li><code>LLAMA_ARG_EMBEDDINGS</code></li> <li><code>LLAMA_ARG_HOST</code></li> <li><code>LLAMA_ARG_PORT</code></li> <li><code>LLAMA_ARG_NO_MMAP</code></li> <li><code>LLAMA_ARG_CTX_SIZE</code></li> <li><code>LLAMA_ARG_N_PARALLEL</code></li> <li><code>LLAMA_ARG_ENDPOINT_METRICS</code></li>";
		const gt = "In case the deployment fails, please watch the log output for any error messages.";
		const ht = 'You can access the logs by clicking on the <strong>Logs</strong> tab on the endpoint details page. To learn more, refer to the <a href="./logs">Logs</a> documentation.';
		const vt = `<strong>Malloc failed: out of memory</strong><br/>\nIf you see this error message in the log:`;
		const _t = "That means the selected hardware configuration does not have enough memory to accommodate the selected GGUF model. You can try to:";
		const bt = "<li>Lower the number of maximum tokens per request</li> <li>Lower the number of concurrent requests</li> <li>Select a smaller GGUF model</li> <li>Select a larger hardware configuration</li>";
		const yt = `<p><strong>Workload evicted, storage limit exceeded</strong><br/>\nThis error message indicates that the hardware has too little memory to accommodate the selected GGUF model. Try selecting a smaller model or select a larger hardware configuration.</p>`;
		const Mt = `<p><strong>Other problems</strong><br/>\nFor other problems, please refer to the <a href="https://github.com/ggerganov/llama.cpp/issues" rel="nofollow">llama.cpp issues page</a>. In case you want to create a new issue, please also include the full log output in your bug report.</p>`;

		// Node, separator and child-component handles; all are assigned in
		// c() / l() (or just below, for the components) and read by the other methods.
		let d, re, ae, me, T, pe, x, ce, C, ue, A, fe, G, de, H, ge, U, he, P, ve,
			k, _e, I, be, R, lt, ye, M, Me, E, it, Le, L, we, S, ot, $e, w, Te, j,
			xe, u, mt, Ce, F, Ae, q, Ge, Y, He, Z, Ue, N, Pe, W, ke, z, Ie, O, Re,
			B, Ee, X, Se, V, je, f, c, Q, Ne, D, We, K, ze, ee, Oe, te, Be, le, Fe,
			J, qe, oe, Ye;

		// Child components, instantiated once per fragment.
		T = new At({ props: { containerStyle: "float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;" } });
		x = new ie({ props: { title: "llama.cpp", local: "llamacpp", headingTag: "h1" } });
		P = new ie({ props: { title: "Deployment Steps", local: "deployment-steps", headingTag: "h2" } });
		F = new ie({ props: { title: "Configurations", local: "configurations", headingTag: "h2" } });
		Y = new ie({ props: { title: "Basic Configurations", local: "basic-configurations", headingTag: "h3" } });
		N = new ie({ props: { title: "Advanced Configurations", local: "advanced-configurations", headingTag: "h3" } });
		B = new ie({ props: { title: "Troubleshooting", local: "troubleshooting", headingTag: "h2" } });
		D = new Ht({
			props: {
				// Base64 of the URI-encoded raw log text; its line breaks are bare
				// "%0A", which is what `highlighted` below mirrors with "\n".
				code: "Z2dtbF9iYWNrZW5kX2N1ZGFfYnVmZmVyX3R5cGVfYWxsb2NfYnVmZmVyJTNBJTIwYWxsb2NhdGluZyUyMDY3MjAwLjAwJTIwTWlCJTIwb24lMjBkZXZpY2UlMjAwJTNBJTIwY3VkYSUwQU1hbGxvYyUyMGZhaWxlZCUzQSUyMG91dCUyMG9mJTIwbWVtb3J5JTBBbGxhbWFfa3ZfY2FjaGVfaW5pdCUzQSUyMGZhaWxlZCUyMHRvJTIwYWxsb2NhdGUlMjBidWZmZXIlMjBmb3IlMjBrdiUyMGNhY2hlJTBBbGxhbWFfbmV3X2NvbnRleHRfd2l0aF9tb2RlbCUzQSUyMGxsYW1hX2t2X2NhY2hlX2luaXQoKSUyMGZhaWxlZCUyMGZvciUyMHNlbGYtYXR0ZW50aW9uJTIwY2FjaGUlMEEuLi4=",
				highlighted: `ggml_backend_cuda_buffer_type_alloc_buffer: allocating <span class="hljs-number">67200.00</span> MiB <span class="hljs-keyword">on</span> device <span class="hljs-number">0</span>: cuda\nMalloc failed: out of memory\nllama_kv_cache_init: failed <span class="hljs-keyword">to</span> allocate buffer for kv <span class="hljs-keyword">cache</span>\nllama_new_context_with_model: llama_kv_cache_init() failed for <span class="hljs-built_in">self</span><span class="hljs-params">-attention</span> <span class="hljs-keyword">cache</span>\n<span class="hljs-params">...</span>`,
				wrap: false,
			},
		});
		J = new Gt({ props: { source: "https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/engines/llama_cpp.md" } });

		return {
			// Builds every node from scratch (presumably the "create" phase; `o`
			// looks like an element factory and `i` a separator/text factory — confirm).
			c() {
				d = o("meta");
				re = i();
				ae = o("p");
				me = i();
				g(T.$$.fragment);
				pe = i();
				g(x.$$.fragment);
				ce = i();
				C = o("p");
				C.textContent = Ve;
				ue = i();
				A = o("p");
				A.textContent = De;
				fe = i();
				G = o("ul");
				G.innerHTML = Je;
				de = i();
				H = o("p");
				H.innerHTML = Qe;
				ge = i();
				U = o("p");
				U.innerHTML = Ke;
				he = i();
				g(P.$$.fragment);
				ve = i();
				k = o("p");
				k.textContent = et;
				_e = i();
				I = o("ol");
				I.innerHTML = tt;
				be = i();
				R = o("img");
				ye = i();
				M = o("ol");
				M.innerHTML = nt;
				Me = i();
				E = o("img");
				Le = i();
				L = o("ol");
				L.innerHTML = at;
				we = i();
				S = o("img");
				$e = i();
				w = o("ol");
				w.innerHTML = st;
				Te = i();
				j = o("p");
				j.textContent = rt;
				xe = i();
				u = o("video");
				Ce = i();
				g(F.$$.fragment);
				Ae = i();
				q = o("p");
				q.innerHTML = pt;
				Ge = i();
				g(Y.$$.fragment);
				He = i();
				Z = o("ul");
				Z.innerHTML = ct;
				Ue = i();
				g(N.$$.fragment);
				Pe = i();
				W = o("p");
				W.innerHTML = ut;
				ke = i();
				z = o("p");
				z.textContent = ft;
				Ie = i();
				O = o("ul");
				O.innerHTML = dt;
				Re = i();
				g(B.$$.fragment);
				Ee = i();
				X = o("p");
				X.textContent = gt;
				Se = i();
				V = o("p");
				V.innerHTML = ht;
				je = i();
				f = o("ul");
				c = o("li");
				Q = o("p");
				Q.innerHTML = vt;
				Ne = i();
				g(D.$$.fragment);
				We = i();
				K = o("p");
				K.textContent = _t;
				ze = i();
				ee = o("ul");
				ee.innerHTML = bt;
				Oe = i();
				te = o("li");
				te.innerHTML = yt;
				Be = i();
				le = o("li");
				le.innerHTML = Mt;
				Fe = i();
				g(J.$$.fragment);
				qe = i();
				oe = o("p");
				this.h();
			},

			// Obtains the same nodes from the existing node list `e` instead of
			// creating them (presumably hydration of server-rendered markup — confirm).
			// Content is only (re)assigned when the value returned by r(node)
			// differs from the expected "svelte-…" marker.
			l(e) {
				const t = Ct("svelte-u9bgzb", document.head);
				d = s(t, "META", { name: true, content: true });
				t.forEach(l);
				re = a(e);
				ae = s(e, "P", {});
				se(ae).forEach(l);
				me = a(e);
				h(T.$$.fragment, e);
				pe = a(e);
				h(x.$$.fragment, e);
				ce = a(e);
				C = s(e, "P", { "data-svelte-h": true });
				if (r(C) !== "svelte-17wgbkm") C.textContent = Ve;
				ue = a(e);
				A = s(e, "P", { "data-svelte-h": true });
				if (r(A) !== "svelte-plvtos") A.textContent = De;
				fe = a(e);
				G = s(e, "UL", { "data-svelte-h": true });
				if (r(G) !== "svelte-1o15oyk") G.innerHTML = Je;
				de = a(e);
				H = s(e, "P", { "data-svelte-h": true });
				if (r(H) !== "svelte-qmge18") H.innerHTML = Qe;
				ge = a(e);
				U = s(e, "P", { "data-svelte-h": true });
				if (r(U) !== "svelte-1nwwlvj") U.innerHTML = Ke;
				he = a(e);
				h(P.$$.fragment, e);
				ve = a(e);
				k = s(e, "P", { "data-svelte-h": true });
				if (r(k) !== "svelte-1viuqt7") k.textContent = et;
				_e = a(e);
				I = s(e, "OL", { "data-svelte-h": true });
				if (r(I) !== "svelte-zqz83d") I.innerHTML = tt;
				be = a(e);
				R = s(e, "IMG", { src: true, alt: true });
				ye = a(e);
				M = s(e, "OL", { start: true, "data-svelte-h": true });
				if (r(M) !== "svelte-a3ltnp") M.innerHTML = nt;
				Me = a(e);
				E = s(e, "IMG", { src: true, alt: true });
				Le = a(e);
				L = s(e, "OL", { start: true, "data-svelte-h": true });
				if (r(L) !== "svelte-1mdwz6y") L.innerHTML = at;
				we = a(e);
				S = s(e, "IMG", { src: true, alt: true });
				$e = a(e);
				w = s(e, "OL", { start: true, "data-svelte-h": true });
				if (r(w) !== "svelte-1vuz66q") w.innerHTML = st;
				Te = a(e);
				j = s(e, "P", { "data-svelte-h": true });
				if (r(j) !== "svelte-174w0va") j.textContent = rt;
				xe = a(e);
				u = s(e, "VIDEO", { width: true, height: true, src: true });
				se(u).forEach(l);
				Ce = a(e);
				h(F.$$.fragment, e);
				Ae = a(e);
				q = s(e, "P", { "data-svelte-h": true });
				if (r(q) !== "svelte-jdgis4") q.innerHTML = pt;
				Ge = a(e);
				h(Y.$$.fragment, e);
				He = a(e);
				Z = s(e, "UL", { "data-svelte-h": true });
				if (r(Z) !== "svelte-d1iatn") Z.innerHTML = ct;
				Ue = a(e);
				h(N.$$.fragment, e);
				Pe = a(e);
				W = s(e, "P", { "data-svelte-h": true });
				if (r(W) !== "svelte-1sl7l1d") W.innerHTML = ut;
				ke = a(e);
				z = s(e, "P", { "data-svelte-h": true });
				if (r(z) !== "svelte-omzti") z.textContent = ft;
				Ie = a(e);
				O = s(e, "UL", { "data-svelte-h": true });
				if (r(O) !== "svelte-apzx5r") O.innerHTML = dt;
				Re = a(e);
				h(B.$$.fragment, e);
				Ee = a(e);
				X = s(e, "P", { "data-svelte-h": true });
				if (r(X) !== "svelte-sywduw") X.textContent = gt;
				Se = a(e);
				V = s(e, "P", { "data-svelte-h": true });
				if (r(V) !== "svelte-dnupoj") V.innerHTML = ht;
				je = a(e);
				f = s(e, "UL", {});
				const ne = se(f);
				c = s(ne, "LI", {});
				const $ = se(c);
				Q = s($, "P", { "data-svelte-h": true });
				if (r(Q) !== "svelte-14vukni") Q.innerHTML = vt;
				Ne = a($);
				h(D.$$.fragment, $);
				We = a($);
				K = s($, "P", { "data-svelte-h": true });
				if (r(K) !== "svelte-ygfqn1") K.textContent = _t;
				ze = a($);
				ee = s($, "UL", { "data-svelte-h": true });
				if (r(ee) !== "svelte-7cq89u") ee.innerHTML = bt;
				$.forEach(l);
				Oe = a(ne);
				te = s(ne, "LI", { "data-svelte-h": true });
				if (r(te) !== "svelte-9ofzm8") te.innerHTML = yt;
				Be = a(ne);
				le = s(ne, "LI", { "data-svelte-h": true });
				if (r(le) !== "svelte-xbgvsk") le.innerHTML = Mt;
				ne.forEach(l);
				Fe = a(e);
				h(J.$$.fragment, e);
				qe = a(e);
				oe = s(e, "P", {});
				se(oe).forEach(l);
				this.h();
			},

			// Applies attributes to the nodes; called at the end of both c() and l().
			// For media sources, m(node, "src", url) is skipped when Ze(node.src, url)
			// is truthy (presumably "already points at this URL" — confirm).
			h() {
				m(d, "name", "hf:doc:metadata");
				m(d, "content", Pt);

				lt = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/endpoints/llamacpp_1.png";
				if (!Ze(R.src, lt)) m(R, "src", lt);
				m(R, "alt", "Select model");
				m(M, "start", "2");

				it = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/endpoints/llamacpp_2.png";
				if (!Ze(E.src, it)) m(E, "src", it);
				m(E, "alt", "Select GGUF file");
				m(L, "start", "3");

				ot = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/endpoints/llamacpp_3.png";
				if (!Ze(S.src, ot)) m(S, "src", ot);
				m(S, "alt", "Select hardware");
				m(w, "start", "4");

				m(u, "width", "1280");
				m(u, "height", "720");
				mt = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/endpoints/llamacpp_guide_video.mp4";
				if (!Ze(u.src, mt)) m(u, "src", mt);
				u.controls = "true";
			},

			// Inserts everything under target `e` (before anchor `t`) in document
			// order; the <meta> node goes into document.head instead.
			m(e, t) {
				p(document.head, d);
				n(e, re, t);
				n(e, ae, t);
				n(e, me, t);
				v(T, e, t);
				n(e, pe, t);
				v(x, e, t);
				n(e, ce, t);
				n(e, C, t);
				n(e, ue, t);
				n(e, A, t);
				n(e, fe, t);
				n(e, G, t);
				n(e, de, t);
				n(e, H, t);
				n(e, ge, t);
				n(e, U, t);
				n(e, he, t);
				v(P, e, t);
				n(e, ve, t);
				n(e, k, t);
				n(e, _e, t);
				n(e, I, t);
				n(e, be, t);
				n(e, R, t);
				n(e, ye, t);
				n(e, M, t);
				n(e, Me, t);
				n(e, E, t);
				n(e, Le, t);
				n(e, L, t);
				n(e, we, t);
				n(e, S, t);
				n(e, $e, t);
				n(e, w, t);
				n(e, Te, t);
				n(e, j, t);
				n(e, xe, t);
				n(e, u, t);
				n(e, Ce, t);
				v(F, e, t);
				n(e, Ae, t);
				n(e, q, t);
				n(e, Ge, t);
				v(Y, e, t);
				n(e, He, t);
				n(e, Z, t);
				n(e, Ue, t);
				v(N, e, t);
				n(e, Pe, t);
				n(e, W, t);
				n(e, ke, t);
				n(e, z, t);
				n(e, Ie, t);
				n(e, O, t);
				n(e, Re, t);
				v(B, e, t);
				n(e, Ee, t);
				n(e, X, t);
				n(e, Se, t);
				n(e, V, t);
				n(e, je, t);
				n(e, f, t);
				// Troubleshooting list: nested items are appended to their parents.
				p(f, c);
				p(c, Q);
				p(c, Ne);
				v(D, c, null);
				p(c, We);
				p(c, K);
				p(c, ze);
				p(c, ee);
				p(f, Oe);
				p(f, te);
				p(f, Be);
				p(f, le);
				n(e, Fe, t);
				v(J, e, t);
				n(e, qe, t);
				n(e, oe, t);
				Ye = true;
			},

			// The page has no reactive state, so the update hook is the imported `wt`.
			p: wt,

			// Runs `_` on every child component once; `Ye` guards against repeats.
			i(e) {
				if (Ye) return;
				for (const child of [T, x, P, F, Y, N, B, D, J]) _(child.$$.fragment, e);
				Ye = true;
			},

			// Runs `b` on every child component and clears the `Ye` guard.
			o(e) {
				for (const child of [T, x, P, F, Y, N, B, D, J]) b(child.$$.fragment, e);
				Ye = false;
			},

			// Tears the fragment down: top-level nodes are passed to `l` only when
			// `e` is truthy; the <meta> node and child components are always handled.
			d(e) {
				if (e) {
					const topLevelNodes = [
						re, ae, me, pe, ce, C, ue, A, fe, G, de, H, ge, U, he, ve, k, _e, I, be,
						R, ye, M, Me, E, Le, L, we, S, $e, w, Te, j, xe, u, Ce, Ae, q, Ge, He,
						Z, Ue, Pe, W, ke, z, Ie, O, Re, Ee, X, Se, V, je, f, Fe, qe, oe,
					];
					for (const node of topLevelNodes) l(node);
				}
				l(d);
				y(T, e);
				y(x, e);
				y(P, e);
				y(F, e);
				y(Y, e);
				y(N, e);
				y(B, e);
				y(D); // nested inside the <li>, so no flag is passed
				y(J, e);
			},
		};
	}

	// JSON table of contents written to the <meta name="hf:doc:metadata"> tag in h().
	const Pt = '{"title":"llama.cpp","local":"llamacpp","sections":[{"title":"Deployment Steps","local":"deployment-steps","sections":[],"depth":2},{"title":"Configurations","local":"configurations","sections":[{"title":"Basic Configurations","local":"basic-configurations","sections":[],"depth":3},{"title":"Advanced Configurations","local":"advanced-configurations","sections":[],"depth":3}],"depth":2},{"title":"Troubleshooting","local":"troubleshooting","sections":[],"depth":2}],"depth":1}';

	/**
	 * Instance function passed to `xt` for the page component.
	 *
	 * Registers a callback with `$t` that reads the `fw` query parameter from the
	 * current URL and discards the value, then returns an empty array.
	 *
	 * @param {*} Xe - Unused.
	 * @returns {Array} Always an empty array.
	 */
	function kt(Xe) {
		$t(() => {
			new URLSearchParams(window.location.search).get("fw");
		});
		return [];
	}

	/** Page component: wires `kt` and `Ut` into the `Tt` base class via `xt`. */
	class jt extends Tt {
		constructor(d) {
			super();
			xt(this, d, kt, Ut, Lt, {});
		}
	}

	export { jt as component };

Xet Storage Details

Size: 14.5 kB
Xet hash: b934accdf1bd95c6bfef47b2560433af839d1c99cbe2dca4bdd40a55d173518e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.