Buckets:

rtrm's picture
download
raw
101 kB
/*
 * Minified, compiled component module (the "data-svelte-h" / "svelte-…" markers
 * below indicate Svelte output). The runtime helpers imported from ../chunks/*
 * are not visible in this file, so the comments in this span describe only what
 * the call sites show; guesses about a helper's role are marked as such.
 */import{s as Ye,c as Le,u as He,g as De,d as Pe,e as Ke,A as Ie,o as ze,f as Oe,n as Gl}from"../chunks/scheduler.37c15a92.js";import{S as _e,i as Fe,g as w,s as o,h,j as Kl,f as e,c as i,k as Wl,a as t,d,t as C,z as lt,m as X,n as k,y as de,E as et,o as tt,r as g,A as st,u as B,x as f,v as G,w as Z}from"../chunks/index.7cb9c9b8.js";import{T as be}from"../chunks/Tip.d10b3fc9.js";import{C as W}from"../chunks/CodeBlock.abae2786.js";import{H as ul,E as nt}from"../chunks/getInferenceSnippets.f9350a3f.js";import{e as Re}from"../chunks/each.e59479a4.js";import{s as he}from"../chunks/stores.cb4752a8.js";/*
 * at(A, y): write query parameter A = y into the current page URL.
 * Parses window.location.href, sets the parameter through URLSearchParams,
 * and installs the result with history.replaceState (the current history
 * entry is replaced; nothing is pushed).
 */function at(A,y){const r=new URL(window.location.href),U=new URLSearchParams(r.search);U.set(A,y),r.search=U.toString(),history.replaceState(null,"",r.toString())}/*
 * Mt(A): read query parameter A from the current page URL.
 * Returns whatever URLSearchParams.get(A) yields for window.location.href.
 */function Mt(A){const y=new URL(window.location.href);return new URLSearchParams(y.search).get(A)}/*
 * xe(A, y, r): build the per-item context for the option list.
 * Shallow-copies the context array A and stores list item y[r] at index 7.
 */function xe(A,y,r){const U=A.slice();return U[7]=y[r],U}/*
 * qe(A): fragment for a single option "pill".
 * - Renders a <div> whose text is A[7] (the option value).
 * - The class string uses the highlighted variant when A[2][A[0]] === A[7],
 *   otherwise the muted/clickable variant.
 * - A click listener calls A[6](A[7]).
 * Returned methods: c (create), l (claim existing nodes), h (set attributes),
 * m (mount + attach listener once), p (update text when dirty bit 2 is set and
 * class when any of bits 0-2 is set), d (detach and remove the listener).
 * NOTE(review): w/X/o/h/k/i/Kl/Wl/t/de/et/tt/e are opaque imports; the roles
 * given here are inferred from how they are called - confirm against the runtime.
 */function qe(A){let y,r=A[7]+"",U,J,T,c,m;function I(){return A[6](A[7])}return{c(){y=w("div"),U=X(r),J=o(),this.h()},l(s){y=h(s,"DIV",{class:!0});var M=Kl(y);U=k(M,r),J=i(M),M.forEach(e),this.h()},h(){Wl(y,"class",T="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd "+(A[2][A[0]]===A[7]?"border-gray-800 bg-black dark:bg-gray-700 text-white":"text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm"))},m(s,M){t(s,y,M),de(y,U),de(y,J),c||(m=et(y,"click",I),c=!0)},p(s,M){A=s,M&2&&r!==(r=A[7]+"")&&tt(U,r),M&7&&T!==(T="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd "+(A[2][A[0]]===A[7]?"border-gray-800 bg-black dark:bg-gray-700 text-white":"text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm"))&&Wl(y,"class",T)},d(s){s&&e(y),c=!1,m()}}}/*
 * pt(A): root fragment of the selector component.
 * - First <div>: one qe() fragment per entry of Re(A[1]), each given an
 *   xe() context.
 * - Second <div class="language-select">: hosts the default slot
 *   (A[5].default, created through Le with scope A[4]).
 * p(): when any of dirty bits 0-3 is set, the list is re-read from s[1];
 * existing children are updated, missing ones are created and mounted, and
 * surplus ones are destroyed. The slot is updated when bit 16 is set (or on the
 * first pass, while J is still false).
 */function pt(A){let y,r,U,J,T=Re(A[1]),c=[];for(let s=0;s<T.length;s+=1)c[s]=qe(xe(A,T,s));const m=A[5].default,I=Le(m,A,A[4],null);return{c(){y=w("div");for(let 
s=0;s<c.length;s+=1)c[s].c();r=o(),U=w("div"),I&&I.c(),this.h()},l(s){y=h(s,"DIV",{class:!0});var M=Kl(y);for(let j=0;j<c.length;j+=1)c[j].l(M);M.forEach(e),r=i(s),U=h(s,"DIV",{class:!0});var Q=Kl(U);I&&I.l(Q),Q.forEach(e),this.h()},h(){Wl(y,"class","flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"),Wl(U,"class","language-select")},m(s,M){t(s,y,M);for(let Q=0;Q<c.length;Q+=1)c[Q]&&c[Q].m(y,null);t(s,r,M),t(s,U,M),I&&I.m(U,null),J=!0},p(s,[M]){if(M&15){T=Re(s[1]);let Q;for(Q=0;Q<T.length;Q+=1){const j=xe(s,T,Q);c[Q]?c[Q].p(j,M):(c[Q]=qe(j),c[Q].c(),c[Q].m(y,null))}for(;Q<c.length;Q+=1)c[Q].d(1);c.length=T.length}I&&I.p&&(!J||M&16)&&He(I,m,s,s[4],J?Pe(m,s[4],M,null):De(s[4]),null)},i(s){J||(d(I,s),J=!0)},o(s){C(I,s),J=!1},d(s){s&&(e(y),e(r),e(U)),lt(c,s),I&&I.d(s)}}}/*
 * ot(A, y, r): instance logic of the selector component.
 * Props read from y: id (c), options (m), plus $$slots / $$scope.
 * - Ke(A, he, cb) keeps U in sync with the imported store `he` (slot 2).
 * - On init, U[id] is assigned options[0] and passed to Ie(he, ..., U).
 * - I(M): assigns U[id] = M, passes it to Ie, and mirrors the choice into the
 *   URL via at(id, M).
 * - The ze() callback reads Mt(id); when that value is truthy and contained
 *   in options, it is assigned to U[id] the same way.
 * - A.$$set copies incoming id / options / $$scope into slots 0 / 1 / 4.
 * Returns the context array [id, options, U, I, $$scope, $$slots, s], where
 * s is the click handler qe() invokes as A[6].
 * NOTE(review): Ie looks like a "set store value" helper and ze like an
 * on-mount hook; both are opaque imports - confirm.
 * NOTE(review): options[0] is read unconditionally; behaviour with an empty
 * or missing options prop is not guarded here.
 */function ot(A,y,r){let U;Ke(A,he,M=>r(2,U=M));let{$$slots:J={},$$scope:T}=y,{id:c}=y,{options:m}=y;Ie(he,U[c]=m[0],U);function I(M){Ie(he,U[c]=M,U),at(c,M)}ze(()=>{const M=Mt(c);M&&m.includes(M)&&Ie(he,U[c]=M,U)});const s=M=>I(M);return A.$$set=M=>{"id"in M&&r(0,c=M.id),"options"in M&&r(1,m=M.options),"$$scope"in M&&r(4,T=M.$$scope)},[c,m,U,I,T,J,s]}/*
 * Dl: component class for the selector. The constructor passes ot (instance),
 * pt (fragment), Ye, and the prop-to-slot map {id: 0, options: 1} to Fe.
 */class Dl extends _e{constructor(y){super(),Fe(this,y,ot,pt,Ye,{id:0,options:1})}}/*
 * it(A): content fragment whose strings hold the "Flash Attention"
 * explanation paragraphs. Only the opening of this definition lies inside
 * this span; the rest follows on the next source line.
 */function it(A){let y,r='Flash Attention is a technique that optimizes the attention mechanism in transformer models by addressing memory bandwidth bottlenecks. As discussed earlier in <a href="/course/chapter1/8">Chapter 1.8</a>, the attention mechanism has quadratic complexity and memory usage, making it inefficient for long sequences.',U,J,T="The key innovation is in how it manages memory transfers between High Bandwidth Memory (HBM) and faster SRAM cache. Traditional attention repeatedly transfers data between HBM and SRAM, creating bottlenecks by leaving the GPU idle. 
Flash Attention loads data once into SRAM and performs all calculations there, minimizing expensive memory transfers.",c,m,I="While the benefits are most significant during training, Flash Attention’s reduced VRAM usage and improved efficiency make it valuable for inference as well, enabling faster and more scalable LLM serving.";return{c(){y=w("p"),y.innerHTML=r,U=o(),J=w("p"),J.textContent=T,c=o(),m=w("p"),m.textContent=I},l(s){y=h(s,"P",{"data-svelte-h":!0}),f(y)!=="svelte-1rkqssk"&&(y.innerHTML=r),U=i(s),J=h(s,"P",{"data-svelte-h":!0}),f(J)!=="svelte-jnwo3v"&&(J.textContent=T),c=i(s),m=h(s,"P",{"data-svelte-h":!0}),f(m)!=="svelte-9afbfq"&&(m.textContent=I)},m(s,M){t(s,y,M),t(s,U,M),t(s,J,M),t(s,c,M),t(s,m,M)},p:Gl,d(s){s&&(e(y),e(U),e(J),e(c),e(m))}}}function yt(A){let y,r='PagedAttention is a technique that addresses another critical bottleneck in LLM inference: KV cache memory management. As discussed in <a href="/course/chapter1/8">Chapter 1.8</a>, during text generation, the model stores attention keys and values (KV cache) for each generated token to reduce redundant computations. 
The KV cache can become enormous, especially with long sequences or multiple concurrent requests.',U,J,T="vLLM’s key innovation lies in how it manages this cache:",c,m,I="<li><strong>Memory Paging</strong>: Instead of treating the KV cache as one large block, it’s divided into fixed-size “pages” (similar to virtual memory in operating systems).</li> <li><strong>Non-contiguous Storage</strong>: Pages don’t need to be stored contiguously in GPU memory, allowing for more flexible memory allocation.</li> <li><strong>Page Table Management</strong>: A page table tracks which pages belong to which sequence, enabling efficient lookup and access.</li> <li><strong>Memory Sharing</strong>: For operations like parallel sampling, pages storing the KV cache for the prompt can be shared across multiple sequences.</li>",s,M,Q='The PagedAttention approach can lead to up to 24x higher throughput compared to traditional methods, making it a game-changer for production LLM deployments. If you want to go really deep into how PagedAttention works, you can read the <a href="https://docs.vllm.ai/en/latest/design/kernel/paged_attention.html" rel="nofollow">the guide from the vLLM documentation</a>.';return{c(){y=w("p"),y.innerHTML=r,U=o(),J=w("p"),J.textContent=T,c=o(),m=w("ol"),m.innerHTML=I,s=o(),M=w("p"),M.innerHTML=Q},l(j){y=h(j,"P",{"data-svelte-h":!0}),f(y)!=="svelte-121sini"&&(y.innerHTML=r),U=i(j),J=h(j,"P",{"data-svelte-h":!0}),f(J)!=="svelte-1dh20ya"&&(J.textContent=T),c=i(j),m=h(j,"OL",{"data-svelte-h":!0}),f(m)!=="svelte-af7t9r"&&(m.innerHTML=I),s=i(j),M=h(j,"P",{"data-svelte-h":!0}),f(M)!=="svelte-15futu5"&&(M.innerHTML=Q)},m(j,V){t(j,y,V),t(j,U,V),t(j,J,V),t(j,c,V),t(j,m,V),t(j,s,V),t(j,M,V)},p:Gl,d(j){j&&(e(y),e(U),e(J),e(c),e(m),e(s),e(M))}}}function ct(A){let y,r="Quantization in llama.cpp reduces the precision of model weights from 32-bit or 16-bit floating point to lower precision formats like 8-bit integers (INT8), 4-bit, or even lower. 
This significantly reduces memory usage and improves inference speed with minimal quality loss.",U,J,T="Key quantization features in llama.cpp include:",c,m,I="<li><strong>Multiple Quantization Levels</strong>: Supports 8-bit, 4-bit, 3-bit, and even 2-bit quantization</li> <li><strong>GGML/GGUF Format</strong>: Uses custom tensor formats optimized for quantized inference</li> <li><strong>Mixed Precision</strong>: Can apply different quantization levels to different parts of the model</li> <li><strong>Hardware-Specific Optimizations</strong>: Includes optimized code paths for various CPU architectures (AVX2, AVX-512, NEON)</li>",s,M,Q="This approach enables running billion-parameter models on consumer hardware with limited memory, making it perfect for local deployments and edge devices.";return{c(){y=w("p"),y.textContent=r,U=o(),J=w("p"),J.textContent=T,c=o(),m=w("ol"),m.innerHTML=I,s=o(),M=w("p"),M.textContent=Q},l(j){y=h(j,"P",{"data-svelte-h":!0}),f(y)!=="svelte-7sflxc"&&(y.textContent=r),U=i(j),J=h(j,"P",{"data-svelte-h":!0}),f(J)!=="svelte-wl8vzm"&&(J.textContent=T),c=i(j),m=h(j,"OL",{"data-svelte-h":!0}),f(m)!=="svelte-m0ejls"&&(m.innerHTML=I),s=i(j),M=h(j,"P",{"data-svelte-h":!0}),f(M)!=="svelte-gau5ug"&&(M.textContent=Q)},m(j,V){t(j,y,V),t(j,U,V),t(j,J,V),t(j,c,V),t(j,m,V),t(j,s,V),t(j,M,V)},p:Gl,d(j){j&&(e(y),e(U),e(J),e(c),e(m),e(s),e(M))}}}function rt(A){let y,r,U="TGI is easy to install and use, with deep integration into the Hugging Face ecosystem.",J,T,c="First, launch the TGI server using Docker:",m,I,s,M,Q="Then interact with it using Hugging Face’s InferenceClient:",j,V,H,S,cl="Alternatively, you can use the OpenAI client:",N,x,D,E,rl="llama.cpp is easy to install and use, requiring minimal dependencies and supporting both CPU and GPU inference.",R,u,$="First, install and build llama.cpp:",el,q,tl,P,bl="Then, launch the server (with OpenAI API compatibility):",sl,Y,nl,K,dl="Interact with the server using Hugging Face’s 
InferenceClient:",al,z,Ml,O,jl="Alternatively, you can use the OpenAI client:",_,Jl,pl,ll,Cl="vLLM is easy to install and use, with both OpenAI API compatibility and a native Python interface.",ol,F,Tl="First, launch the vLLM OpenAI-compatible server:",wl,L,ml,il,Ul="Then interact with it using Hugging Face’s InferenceClient:",a,b,Sl,hl,fl="Alternatively, you can use the OpenAI client:",gl,yl,Bl,Il;return I=new W({props:{code:"ZG9ja2VyJTIwcnVuJTIwLS1ncHVzJTIwYWxsJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1zaG0tc2l6ZSUyMDFnJTIwJTVDJTBBJTIwJTIwJTIwJTIwLXAlMjA4MDgwJTNBODAlMjAlNUMlMEElMjAlMjAlMjAlMjAtdiUyMH4lMkYuY2FjaGUlMkZodWdnaW5nZmFjZSUzQSUyRmRhdGElMjAlNUMlMEElMjAlMjAlMjAlMjBnaGNyLmlvJTJGaHVnZ2luZ2ZhY2UlMkZ0ZXh0LWdlbmVyYXRpb24taW5mZXJlbmNlJTNBbGF0ZXN0JTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1tb2RlbC1pZCUyMEh1Z2dpbmdGYWNlVEIlMkZTbW9sTE0yLTM2ME0tSW5zdHJ1Y3Q=",highlighted:`docker run --gpus all \\
--shm-size 1g \\
-p 8080:80 \\
-v ~/.cache/huggingface:/data \\
ghcr.io/huggingface/text-generation-inference:latest \\
--model-id HuggingFaceTB/SmolLM2-360M-Instruct`,wrap:!1}}),V=new W({props:{code:"ZnJvbSUyMGh1Z2dpbmdmYWNlX2h1YiUyMGltcG9ydCUyMEluZmVyZW5jZUNsaWVudCUwQSUwQSUyMyUyMEluaXRpYWxpemUlMjBjbGllbnQlMjBwb2ludGluZyUyMHRvJTIwVEdJJTIwZW5kcG9pbnQlMEFjbGllbnQlMjAlM0QlMjBJbmZlcmVuY2VDbGllbnQoJTBBJTIwJTIwJTIwJTIwbW9kZWwlM0QlMjJodHRwJTNBJTJGJTJGbG9jYWxob3N0JTNBODA4MCUyMiUyQyUyMCUyMCUyMyUyMFVSTCUyMHRvJTIwdGhlJTIwVEdJJTIwc2VydmVyJTBBKSUwQSUwQSUyMyUyMFRleHQlMjBnZW5lcmF0aW9uJTBBcmVzcG9uc2UlMjAlM0QlMjBjbGllbnQudGV4dF9nZW5lcmF0aW9uKCUwQSUyMCUyMCUyMCUyMCUyMlRlbGwlMjBtZSUyMGElMjBzdG9yeSUyMiUyQyUwQSUyMCUyMCUyMCUyMG1heF9uZXdfdG9rZW5zJTNEMTAwJTJDJTBBJTIwJTIwJTIwJTIwdGVtcGVyYXR1cmUlM0QwLjclMkMlMEElMjAlMjAlMjAlMjB0b3BfcCUzRDAuOTUlMkMlMEElMjAlMjAlMjAlMjBkZXRhaWxzJTNEVHJ1ZSUyQyUwQSUyMCUyMCUyMCUyMHN0b3Bfc2VxdWVuY2VzJTNEJTVCJTVEJTJDJTBBKSUwQXByaW50KHJlc3BvbnNlLmdlbmVyYXRlZF90ZXh0KSUwQSUwQSUyMyUyMEZvciUyMGNoYXQlMjBmb3JtYXQlMEFyZXNwb25zZSUyMCUzRCUyMGNsaWVudC5jaGF0X2NvbXBsZXRpb24oJTBBJTIwJTIwJTIwJTIwbWVzc2FnZXMlM0QlNUIlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlN0IlMjJyb2xlJTIyJTNBJTIwJTIyc3lzdGVtJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMllvdSUyMGFyZSUyMGElMjBoZWxwZnVsJTIwYXNzaXN0YW50LiUyMiU3RCUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCU3QiUyMnJvbGUlMjIlM0ElMjAlMjJ1c2VyJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMlRlbGwlMjBtZSUyMGElMjBzdG9yeSUyMiU3RCUyQyUwQSUyMCUyMCUyMCUyMCU1RCUyQyUwQSUyMCUyMCUyMCUyMG1heF90b2tlbnMlM0QxMDAlMkMlMEElMjAlMjAlMjAlMjB0ZW1wZXJhdHVyZSUzRDAuNyUyQyUwQSUyMCUyMCUyMCUyMHRvcF9wJTNEMC45NSUyQyUwQSklMEFwcmludChyZXNwb25zZS5jaG9pY2VzJTVCMCU1RC5tZXNzYWdlLmNvbnRlbnQp",highlighted:`<span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient
<span class="hljs-comment"># Initialize client pointing to TGI endpoint</span>
client = InferenceClient(
model=<span class="hljs-string">&quot;http://localhost:8080&quot;</span>, <span class="hljs-comment"># URL to the TGI server</span>
)
<span class="hljs-comment"># Text generation</span>
response = client.text_generation(
<span class="hljs-string">&quot;Tell me a story&quot;</span>,
max_new_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
details=<span class="hljs-literal">True</span>,
stop_sequences=[],
)
<span class="hljs-built_in">print</span>(response.generated_text)
<span class="hljs-comment"># For chat format</span>
response = client.chat_completion(
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a helpful assistant.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Tell me a story&quot;</span>},
],
max_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)`,wrap:!1}}),x=new W({props:{code:"ZnJvbSUyMG9wZW5haSUyMGltcG9ydCUyME9wZW5BSSUwQSUwQSUyMyUyMEluaXRpYWxpemUlMjBjbGllbnQlMjBwb2ludGluZyUyMHRvJTIwVEdJJTIwZW5kcG9pbnQlMEFjbGllbnQlMjAlM0QlMjBPcGVuQUkoJTBBJTIwJTIwJTIwJTIwYmFzZV91cmwlM0QlMjJodHRwJTNBJTJGJTJGbG9jYWxob3N0JTNBODA4MCUyRnYxJTIyJTJDJTIwJTIwJTIzJTIwTWFrZSUyMHN1cmUlMjB0byUyMGluY2x1ZGUlMjAlMkZ2MSUwQSUyMCUyMCUyMCUyMGFwaV9rZXklM0QlMjJub3QtbmVlZGVkJTIyJTJDJTIwJTIwJTIzJTIwVEdJJTIwZG9lc24ndCUyMHJlcXVpcmUlMjBhbiUyMEFQSSUyMGtleSUyMGJ5JTIwZGVmYXVsdCUwQSklMEElMEElMjMlMjBDaGF0JTIwY29tcGxldGlvbiUwQXJlc3BvbnNlJTIwJTNEJTIwY2xpZW50LmNoYXQuY29tcGxldGlvbnMuY3JlYXRlKCUwQSUyMCUyMCUyMCUyMG1vZGVsJTNEJTIySHVnZ2luZ0ZhY2VUQiUyRlNtb2xMTTItMzYwTS1JbnN0cnVjdCUyMiUyQyUwQSUyMCUyMCUyMCUyMG1lc3NhZ2VzJTNEJTVCJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMnN5c3RlbSUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJZb3UlMjBhcmUlMjBhJTIwaGVscGZ1bCUyMGFzc2lzdGFudC4lMjIlN0QlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlN0IlMjJyb2xlJTIyJTNBJTIwJTIydXNlciUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJUZWxsJTIwbWUlMjBhJTIwc3RvcnklMjIlN0QlMkMlMEElMjAlMjAlMjAlMjAlNUQlMkMlMEElMjAlMjAlMjAlMjBtYXhfdG9rZW5zJTNEMTAwJTJDJTBBJTIwJTIwJTIwJTIwdGVtcGVyYXR1cmUlM0QwLjclMkMlMEElMjAlMjAlMjAlMjB0b3BfcCUzRDAuOTUlMkMlMEEpJTBBcHJpbnQocmVzcG9uc2UuY2hvaWNlcyU1QjAlNUQubWVzc2FnZS5jb250ZW50KQ==",highlighted:`<span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI
<span class="hljs-comment"># Initialize client pointing to TGI endpoint</span>
client = OpenAI(
base_url=<span class="hljs-string">&quot;http://localhost:8080/v1&quot;</span>, <span class="hljs-comment"># Make sure to include /v1</span>
api_key=<span class="hljs-string">&quot;not-needed&quot;</span>, <span class="hljs-comment"># TGI doesn&#x27;t require an API key by default</span>
)
<span class="hljs-comment"># Chat completion</span>
response = client.chat.completions.create(
model=<span class="hljs-string">&quot;HuggingFaceTB/SmolLM2-360M-Instruct&quot;</span>,
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a helpful assistant.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Tell me a story&quot;</span>},
],
max_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)`,wrap:!1}}),q=new W({props:{code:"JTIzJTIwQ2xvbmUlMjB0aGUlMjByZXBvc2l0b3J5JTBBZ2l0JTIwY2xvbmUlMjBodHRwcyUzQSUyRiUyRmdpdGh1Yi5jb20lMkZnZ2VyZ2Fub3YlMkZsbGFtYS5jcHAlMEFjZCUyMGxsYW1hLmNwcCUwQSUwQSUyMyUyMEJ1aWxkJTIwdGhlJTIwcHJvamVjdCUwQW1ha2UlMEElMEElMjMlMjBEb3dubG9hZCUyMHRoZSUyMFNtb2xMTTItMS43Qi1JbnN0cnVjdC1HR1VGJTIwbW9kZWwlMEFjdXJsJTIwLUwlMjAtTyUyMGh0dHBzJTNBJTJGJTJGaHVnZ2luZ2ZhY2UuY28lMkZIdWdnaW5nRmFjZVRCJTJGU21vbExNMi0xLjdCLUluc3RydWN0LUdHVUYlMkZyZXNvbHZlJTJGbWFpbiUyRnNtb2xsbTItMS43Yi1pbnN0cnVjdC5RNF9LX00uZ2d1Zg==",highlighted:`<span class="hljs-comment"># Clone the repository</span>
git <span class="hljs-built_in">clone</span> https://github.com/ggerganov/llama.cpp
<span class="hljs-built_in">cd</span> llama.cpp
<span class="hljs-comment"># Build the project</span>
make
<span class="hljs-comment"># Download the SmolLM2-1.7B-Instruct-GGUF model</span>
curl -L -O https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF/resolve/main/smollm2-1.7b-instruct.Q4_K_M.gguf`,wrap:!1}}),Y=new W({props:{code:"JTIzJTIwU3RhcnQlMjB0aGUlMjBzZXJ2ZXIlMEEuJTJGc2VydmVyJTIwJTVDJTBBJTIwJTIwJTIwJTIwLW0lMjBzbW9sbG0yLTEuN2ItaW5zdHJ1Y3QuUTRfS19NLmdndWYlMjAlNUMlMEElMjAlMjAlMjAlMjAtLWhvc3QlMjAwLjAuMC4wJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1wb3J0JTIwODA4MCUyMCU1QyUwQSUyMCUyMCUyMCUyMC1jJTIwNDA5NiUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tbi1ncHUtbGF5ZXJzJTIwMCUyMCUyMCUyMyUyMFNldCUyMHRvJTIwYSUyMGhpZ2hlciUyMG51bWJlciUyMHRvJTIwdXNlJTIwR1BV",highlighted:`<span class="hljs-comment"># Start the server</span>
./server \\
-m smollm2-1.7b-instruct.Q4_K_M.gguf \\
--host 0.0.0.0 \\
--port 8080 \\
-c 4096 \\
--n-gpu-layers 0 <span class="hljs-comment"># Set to a higher number to use GPU</span>`,wrap:!1}}),z=new W({props:{code:"ZnJvbSUyMGh1Z2dpbmdmYWNlX2h1YiUyMGltcG9ydCUyMEluZmVyZW5jZUNsaWVudCUwQSUwQSUyMyUyMEluaXRpYWxpemUlMjBjbGllbnQlMjBwb2ludGluZyUyMHRvJTIwbGxhbWEuY3BwJTIwc2VydmVyJTBBY2xpZW50JTIwJTNEJTIwSW5mZXJlbmNlQ2xpZW50KCUwQSUyMCUyMCUyMCUyMG1vZGVsJTNEJTIyaHR0cCUzQSUyRiUyRmxvY2FsaG9zdCUzQTgwODAlMkZ2MSUyMiUyQyUyMCUyMCUyMyUyMFVSTCUyMHRvJTIwdGhlJTIwbGxhbWEuY3BwJTIwc2VydmVyJTBBJTIwJTIwJTIwJTIwdG9rZW4lM0QlMjJzay1uby1rZXktcmVxdWlyZWQlMjIlMkMlMjAlMjAlMjMlMjBsbGFtYS5jcHAlMjBzZXJ2ZXIlMjByZXF1aXJlcyUyMHRoaXMlMjBwbGFjZWhvbGRlciUwQSklMEElMEElMjMlMjBUZXh0JTIwZ2VuZXJhdGlvbiUwQXJlc3BvbnNlJTIwJTNEJTIwY2xpZW50LnRleHRfZ2VuZXJhdGlvbiglMEElMjAlMjAlMjAlMjAlMjJUZWxsJTIwbWUlMjBhJTIwc3RvcnklMjIlMkMlMEElMjAlMjAlMjAlMjBtYXhfbmV3X3Rva2VucyUzRDEwMCUyQyUwQSUyMCUyMCUyMCUyMHRlbXBlcmF0dXJlJTNEMC43JTJDJTBBJTIwJTIwJTIwJTIwdG9wX3AlM0QwLjk1JTJDJTBBJTIwJTIwJTIwJTIwZGV0YWlscyUzRFRydWUlMkMlMEEpJTBBcHJpbnQocmVzcG9uc2UuZ2VuZXJhdGVkX3RleHQpJTBBJTBBJTIzJTIwRm9yJTIwY2hhdCUyMGZvcm1hdCUwQXJlc3BvbnNlJTIwJTNEJTIwY2xpZW50LmNoYXRfY29tcGxldGlvbiglMEElMjAlMjAlMjAlMjBtZXNzYWdlcyUzRCU1QiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCU3QiUyMnJvbGUlMjIlM0ElMjAlMjJzeXN0ZW0lMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIyWW91JTIwYXJlJTIwYSUyMGhlbHBmdWwlMjBhc3Npc3RhbnQuJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMnVzZXIlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIyVGVsbCUyMG1lJTIwYSUyMHN0b3J5JTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTVEJTJDJTBBJTIwJTIwJTIwJTIwbWF4X3Rva2VucyUzRDEwMCUyQyUwQSUyMCUyMCUyMCUyMHRlbXBlcmF0dXJlJTNEMC43JTJDJTBBJTIwJTIwJTIwJTIwdG9wX3AlM0QwLjk1JTJDJTBBKSUwQXByaW50KHJlc3BvbnNlLmNob2ljZXMlNUIwJTVELm1lc3NhZ2UuY29udGVudCk=",highlighted:`<span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient
<span class="hljs-comment"># Initialize client pointing to llama.cpp server</span>
client = InferenceClient(
model=<span class="hljs-string">&quot;http://localhost:8080/v1&quot;</span>, <span class="hljs-comment"># URL to the llama.cpp server</span>
token=<span class="hljs-string">&quot;sk-no-key-required&quot;</span>, <span class="hljs-comment"># llama.cpp server requires this placeholder</span>
)
<span class="hljs-comment"># Text generation</span>
response = client.text_generation(
<span class="hljs-string">&quot;Tell me a story&quot;</span>,
max_new_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
details=<span class="hljs-literal">True</span>,
)
<span class="hljs-built_in">print</span>(response.generated_text)
<span class="hljs-comment"># For chat format</span>
response = client.chat_completion(
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a helpful assistant.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Tell me a story&quot;</span>},
],
max_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)`,wrap:!1}}),Jl=new W({props:{code:"ZnJvbSUyMG9wZW5haSUyMGltcG9ydCUyME9wZW5BSSUwQSUwQSUyMyUyMEluaXRpYWxpemUlMjBjbGllbnQlMjBwb2ludGluZyUyMHRvJTIwbGxhbWEuY3BwJTIwc2VydmVyJTBBY2xpZW50JTIwJTNEJTIwT3BlbkFJKCUwQSUyMCUyMCUyMCUyMGJhc2VfdXJsJTNEJTIyaHR0cCUzQSUyRiUyRmxvY2FsaG9zdCUzQTgwODAlMkZ2MSUyMiUyQyUwQSUyMCUyMCUyMCUyMGFwaV9rZXklM0QlMjJzay1uby1rZXktcmVxdWlyZWQlMjIlMkMlMjAlMjAlMjMlMjBsbGFtYS5jcHAlMjBzZXJ2ZXIlMjByZXF1aXJlcyUyMHRoaXMlMjBwbGFjZWhvbGRlciUwQSklMEElMEElMjMlMjBDaGF0JTIwY29tcGxldGlvbiUwQXJlc3BvbnNlJTIwJTNEJTIwY2xpZW50LmNoYXQuY29tcGxldGlvbnMuY3JlYXRlKCUwQSUyMCUyMCUyMCUyMG1vZGVsJTNEJTIyc21vbGxtMi0xLjdiLWluc3RydWN0JTIyJTJDJTIwJTIwJTIzJTIwTW9kZWwlMjBpZGVudGlmaWVyJTIwY2FuJTIwYmUlMjBhbnl0aGluZyUyMGFzJTIwc2VydmVyJTIwb25seSUyMGxvYWRzJTIwb25lJTIwbW9kZWwlMEElMjAlMjAlMjAlMjBtZXNzYWdlcyUzRCU1QiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCU3QiUyMnJvbGUlMjIlM0ElMjAlMjJzeXN0ZW0lMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIyWW91JTIwYXJlJTIwYSUyMGhlbHBmdWwlMjBhc3Npc3RhbnQuJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMnVzZXIlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIyVGVsbCUyMG1lJTIwYSUyMHN0b3J5JTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTVEJTJDJTBBJTIwJTIwJTIwJTIwbWF4X3Rva2VucyUzRDEwMCUyQyUwQSUyMCUyMCUyMCUyMHRlbXBlcmF0dXJlJTNEMC43JTJDJTBBJTIwJTIwJTIwJTIwdG9wX3AlM0QwLjk1JTJDJTBBKSUwQXByaW50KHJlc3BvbnNlLmNob2ljZXMlNUIwJTVELm1lc3NhZ2UuY29udGVudCk=",highlighted:`<span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI
<span class="hljs-comment"># Initialize client pointing to llama.cpp server</span>
client = OpenAI(
base_url=<span class="hljs-string">&quot;http://localhost:8080/v1&quot;</span>,
api_key=<span class="hljs-string">&quot;sk-no-key-required&quot;</span>, <span class="hljs-comment"># llama.cpp server requires this placeholder</span>
)
<span class="hljs-comment"># Chat completion</span>
response = client.chat.completions.create(
model=<span class="hljs-string">&quot;smollm2-1.7b-instruct&quot;</span>, <span class="hljs-comment"># Model identifier can be anything as server only loads one model</span>
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a helpful assistant.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Tell me a story&quot;</span>},
],
max_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)`,wrap:!1}}),L=new W({props:{code:"cHl0aG9uJTIwLW0lMjB2bGxtLmVudHJ5cG9pbnRzLm9wZW5haS5hcGlfc2VydmVyJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1tb2RlbCUyMEh1Z2dpbmdGYWNlVEIlMkZTbW9sTE0yLTM2ME0tSW5zdHJ1Y3QlMjAlNUMlMEElMjAlMjAlMjAlMjAtLWhvc3QlMjAwLjAuMC4wJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1wb3J0JTIwODAwMA==",highlighted:`python -m vllm.entrypoints.openai.api_server \\
--model HuggingFaceTB/SmolLM2-360M-Instruct \\
--host 0.0.0.0 \\
--port 8000`,wrap:!1}}),b=new W({props:{code:"ZnJvbSUyMGh1Z2dpbmdmYWNlX2h1YiUyMGltcG9ydCUyMEluZmVyZW5jZUNsaWVudCUwQSUwQSUyMyUyMEluaXRpYWxpemUlMjBjbGllbnQlMjBwb2ludGluZyUyMHRvJTIwdkxMTSUyMGVuZHBvaW50JTBBY2xpZW50JTIwJTNEJTIwSW5mZXJlbmNlQ2xpZW50KCUwQSUyMCUyMCUyMCUyMG1vZGVsJTNEJTIyaHR0cCUzQSUyRiUyRmxvY2FsaG9zdCUzQTgwMDAlMkZ2MSUyMiUyQyUyMCUyMCUyMyUyMFVSTCUyMHRvJTIwdGhlJTIwdkxMTSUyMHNlcnZlciUwQSklMEElMEElMjMlMjBUZXh0JTIwZ2VuZXJhdGlvbiUwQXJlc3BvbnNlJTIwJTNEJTIwY2xpZW50LnRleHRfZ2VuZXJhdGlvbiglMEElMjAlMjAlMjAlMjAlMjJUZWxsJTIwbWUlMjBhJTIwc3RvcnklMjIlMkMlMEElMjAlMjAlMjAlMjBtYXhfbmV3X3Rva2VucyUzRDEwMCUyQyUwQSUyMCUyMCUyMCUyMHRlbXBlcmF0dXJlJTNEMC43JTJDJTBBJTIwJTIwJTIwJTIwdG9wX3AlM0QwLjk1JTJDJTBBJTIwJTIwJTIwJTIwZGV0YWlscyUzRFRydWUlMkMlMEEpJTBBcHJpbnQocmVzcG9uc2UuZ2VuZXJhdGVkX3RleHQpJTBBJTBBJTIzJTIwRm9yJTIwY2hhdCUyMGZvcm1hdCUwQXJlc3BvbnNlJTIwJTNEJTIwY2xpZW50LmNoYXRfY29tcGxldGlvbiglMEElMjAlMjAlMjAlMjBtZXNzYWdlcyUzRCU1QiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCU3QiUyMnJvbGUlMjIlM0ElMjAlMjJzeXN0ZW0lMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIyWW91JTIwYXJlJTIwYSUyMGhlbHBmdWwlMjBhc3Npc3RhbnQuJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMnVzZXIlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIyVGVsbCUyMG1lJTIwYSUyMHN0b3J5JTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTVEJTJDJTBBJTIwJTIwJTIwJTIwbWF4X3Rva2VucyUzRDEwMCUyQyUwQSUyMCUyMCUyMCUyMHRlbXBlcmF0dXJlJTNEMC43JTJDJTBBJTIwJTIwJTIwJTIwdG9wX3AlM0QwLjk1JTJDJTBBKSUwQXByaW50KHJlc3BvbnNlLmNob2ljZXMlNUIwJTVELm1lc3NhZ2UuY29udGVudCk=",highlighted:`<span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient
<span class="hljs-comment"># Initialize client pointing to vLLM endpoint</span>
client = InferenceClient(
model=<span class="hljs-string">&quot;http://localhost:8000/v1&quot;</span>, <span class="hljs-comment"># URL to the vLLM server</span>
)
<span class="hljs-comment"># Text generation</span>
response = client.text_generation(
<span class="hljs-string">&quot;Tell me a story&quot;</span>,
max_new_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
details=<span class="hljs-literal">True</span>,
)
<span class="hljs-built_in">print</span>(response.generated_text)
<span class="hljs-comment"># For chat format</span>
response = client.chat_completion(
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a helpful assistant.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Tell me a story&quot;</span>},
],
max_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)`,wrap:!1}}),yl=new W({props:{code:"ZnJvbSUyMG9wZW5haSUyMGltcG9ydCUyME9wZW5BSSUwQSUwQSUyMyUyMEluaXRpYWxpemUlMjBjbGllbnQlMjBwb2ludGluZyUyMHRvJTIwdkxMTSUyMGVuZHBvaW50JTBBY2xpZW50JTIwJTNEJTIwT3BlbkFJKCUwQSUyMCUyMCUyMCUyMGJhc2VfdXJsJTNEJTIyaHR0cCUzQSUyRiUyRmxvY2FsaG9zdCUzQTgwMDAlMkZ2MSUyMiUyQyUwQSUyMCUyMCUyMCUyMGFwaV9rZXklM0QlMjJub3QtbmVlZGVkJTIyJTJDJTIwJTIwJTIzJTIwdkxMTSUyMGRvZXNuJ3QlMjByZXF1aXJlJTIwYW4lMjBBUEklMjBrZXklMjBieSUyMGRlZmF1bHQlMEEpJTBBJTBBJTIzJTIwQ2hhdCUyMGNvbXBsZXRpb24lMEFyZXNwb25zZSUyMCUzRCUyMGNsaWVudC5jaGF0LmNvbXBsZXRpb25zLmNyZWF0ZSglMEElMjAlMjAlMjAlMjBtb2RlbCUzRCUyMkh1Z2dpbmdGYWNlVEIlMkZTbW9sTE0yLTM2ME0tSW5zdHJ1Y3QlMjIlMkMlMEElMjAlMjAlMjAlMjBtZXNzYWdlcyUzRCU1QiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCU3QiUyMnJvbGUlMjIlM0ElMjAlMjJzeXN0ZW0lMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIyWW91JTIwYXJlJTIwYSUyMGhlbHBmdWwlMjBhc3Npc3RhbnQuJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMnVzZXIlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIyVGVsbCUyMG1lJTIwYSUyMHN0b3J5JTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTVEJTJDJTBBJTIwJTIwJTIwJTIwbWF4X3Rva2VucyUzRDEwMCUyQyUwQSUyMCUyMCUyMCUyMHRlbXBlcmF0dXJlJTNEMC43JTJDJTBBJTIwJTIwJTIwJTIwdG9wX3AlM0QwLjk1JTJDJTBBKSUwQXByaW50KHJlc3BvbnNlLmNob2ljZXMlNUIwJTVELm1lc3NhZ2UuY29udGVudCk=",highlighted:`<span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI
<span class="hljs-comment"># Initialize client pointing to vLLM endpoint</span>
client = OpenAI(
base_url=<span class="hljs-string">&quot;http://localhost:8000/v1&quot;</span>,
api_key=<span class="hljs-string">&quot;not-needed&quot;</span>, <span class="hljs-comment"># vLLM doesn&#x27;t require an API key by default</span>
)
<span class="hljs-comment"># Chat completion</span>
response = client.chat.completions.create(
model=<span class="hljs-string">&quot;HuggingFaceTB/SmolLM2-360M-Instruct&quot;</span>,
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a helpful assistant.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Tell me a story&quot;</span>},
],
max_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)`,wrap:!1}}),{c(){y=X(`<hfoption value="tgi" label="TGI">
`),r=w("p"),r.textContent=U,J=o(),T=w("p"),T.textContent=c,m=o(),g(I.$$.fragment),s=o(),M=w("p"),M.textContent=Q,j=o(),g(V.$$.fragment),H=o(),S=w("p"),S.textContent=cl,N=o(),g(x.$$.fragment),D=X(`
</hfoption>
<hfoption value="llama.cpp" label="llama.cpp">
`),E=w("p"),E.textContent=rl,R=o(),u=w("p"),u.textContent=$,el=o(),g(q.$$.fragment),tl=o(),P=w("p"),P.textContent=bl,sl=o(),g(Y.$$.fragment),nl=o(),K=w("p"),K.textContent=dl,al=o(),g(z.$$.fragment),Ml=o(),O=w("p"),O.textContent=jl,_=o(),g(Jl.$$.fragment),pl=X(`
</hfoption>
<hfoption value="vllm" label="vLLM">
`),ll=w("p"),ll.textContent=Cl,ol=o(),F=w("p"),F.textContent=Tl,wl=o(),g(L.$$.fragment),ml=o(),il=w("p"),il.textContent=Ul,a=o(),g(b.$$.fragment),Sl=o(),hl=w("p"),hl.textContent=fl,gl=o(),g(yl.$$.fragment),Bl=X(`
</hfoption>`)},l(n){y=k(n,`<hfoption value="tgi" label="TGI">
`),r=h(n,"P",{"data-svelte-h":!0}),f(r)!=="svelte-1xscn0n"&&(r.textContent=U),J=i(n),T=h(n,"P",{"data-svelte-h":!0}),f(T)!=="svelte-1g20rmd"&&(T.textContent=c),m=i(n),B(I.$$.fragment,n),s=i(n),M=h(n,"P",{"data-svelte-h":!0}),f(M)!=="svelte-168g8u0"&&(M.textContent=Q),j=i(n),B(V.$$.fragment,n),H=i(n),S=h(n,"P",{"data-svelte-h":!0}),f(S)!=="svelte-1vrpxju"&&(S.textContent=cl),N=i(n),B(x.$$.fragment,n),D=k(n,`
</hfoption>
<hfoption value="llama.cpp" label="llama.cpp">
`),E=h(n,"P",{"data-svelte-h":!0}),f(E)!=="svelte-1uzfhnr"&&(E.textContent=rl),R=i(n),u=h(n,"P",{"data-svelte-h":!0}),f(u)!=="svelte-1xrr3cs"&&(u.textContent=$),el=i(n),B(q.$$.fragment,n),tl=i(n),P=h(n,"P",{"data-svelte-h":!0}),f(P)!=="svelte-9xyw1x"&&(P.textContent=bl),sl=i(n),B(Y.$$.fragment,n),nl=i(n),K=h(n,"P",{"data-svelte-h":!0}),f(K)!=="svelte-5g6wzi"&&(K.textContent=dl),al=i(n),B(z.$$.fragment,n),Ml=i(n),O=h(n,"P",{"data-svelte-h":!0}),f(O)!=="svelte-1vrpxju"&&(O.textContent=jl),_=i(n),B(Jl.$$.fragment,n),pl=k(n,`
</hfoption>
<hfoption value="vllm" label="vLLM">
`),ll=h(n,"P",{"data-svelte-h":!0}),f(ll)!=="svelte-1sx76te"&&(ll.textContent=Cl),ol=i(n),F=h(n,"P",{"data-svelte-h":!0}),f(F)!=="svelte-9ezhmp"&&(F.textContent=Tl),wl=i(n),B(L.$$.fragment,n),ml=i(n),il=h(n,"P",{"data-svelte-h":!0}),f(il)!=="svelte-168g8u0"&&(il.textContent=Ul),a=i(n),B(b.$$.fragment,n),Sl=i(n),hl=h(n,"P",{"data-svelte-h":!0}),f(hl)!=="svelte-1vrpxju"&&(hl.textContent=fl),gl=i(n),B(yl.$$.fragment,n),Bl=k(n,`
</hfoption>`)},m(n,v){t(n,y,v),t(n,r,v),t(n,J,v),t(n,T,v),t(n,m,v),G(I,n,v),t(n,s,v),t(n,M,v),t(n,j,v),G(V,n,v),t(n,H,v),t(n,S,v),t(n,N,v),G(x,n,v),t(n,D,v),t(n,E,v),t(n,R,v),t(n,u,v),t(n,el,v),G(q,n,v),t(n,tl,v),t(n,P,v),t(n,sl,v),G(Y,n,v),t(n,nl,v),t(n,K,v),t(n,al,v),G(z,n,v),t(n,Ml,v),t(n,O,v),t(n,_,v),G(Jl,n,v),t(n,pl,v),t(n,ll,v),t(n,ol,v),t(n,F,v),t(n,wl,v),G(L,n,v),t(n,ml,v),t(n,il,v),t(n,a,v),G(b,n,v),t(n,Sl,v),t(n,hl,v),t(n,gl,v),G(yl,n,v),t(n,Bl,v),Il=!0},p:Gl,i(n){Il||(d(I.$$.fragment,n),d(V.$$.fragment,n),d(x.$$.fragment,n),d(q.$$.fragment,n),d(Y.$$.fragment,n),d(z.$$.fragment,n),d(Jl.$$.fragment,n),d(L.$$.fragment,n),d(b.$$.fragment,n),d(yl.$$.fragment,n),Il=!0)},o(n){C(I.$$.fragment,n),C(V.$$.fragment,n),C(x.$$.fragment,n),C(q.$$.fragment,n),C(Y.$$.fragment,n),C(z.$$.fragment,n),C(Jl.$$.fragment,n),C(L.$$.fragment,n),C(b.$$.fragment,n),C(yl.$$.fragment,n),Il=!1},d(n){n&&(e(y),e(r),e(J),e(T),e(m),e(s),e(M),e(j),e(H),e(S),e(N),e(D),e(E),e(R),e(u),e(el),e(tl),e(P),e(sl),e(nl),e(K),e(al),e(Ml),e(O),e(_),e(pl),e(ll),e(ol),e(F),e(wl),e(ml),e(il),e(a),e(Sl),e(hl),e(gl),e(Bl)),Z(I,n),Z(V,n),Z(x,n),Z(q,n),Z(Y,n),Z(z,n),Z(Jl,n),Z(L,n),Z(b,n),Z(yl,n)}}}/**
 * Jt - compiled Svelte fragment factory for the tabbed advanced-usage
 * examples (one hfoption section each for TGI, llama.cpp and vLLM).
 *
 * Creates ten W instances (W is the C export of the CodeBlock chunk), one per
 * snippet. Each gets three props: code (looks like Base64 of the
 * percent-encoded snippet text - TODO confirm against CodeBlock),
 * highlighted (snippet markup with hljs-* span classes) and wrap (false).
 *
 * The returned object carries the fragment methods:
 *   c - builds the hfoption marker text nodes, the p elements (whose
 *       textContent comes from the string locals declared below) and the
 *       CodeBlock fragments;
 *   l - walks the same sequence with the helpers k / h / i / B on the node
 *       list it is given (presumably hydration of server-rendered markup -
 *       confirm against the Svelte runtime) and reassigns textContent only
 *       when f(node) differs from the expected svelte-... hash string;
 *   m - inserts every node with t and calls G for every CodeBlock, in
 *       document order, then sets Ul;
 *   p - Gl from the scheduler chunk (presumably a no-op - confirm);
 *   i - calls d on each CodeBlock fragment unless Ul is already set;
 *   o - calls C on each CodeBlock fragment and clears Ul;
 *   d - calls e on every node when its argument is truthy, then Z on each
 *       CodeBlock.
 *
 * NOTE(review): the code string of the last CodeBlock (ml) is split across
 * two physical lines inside a double-quoted literal, which is not valid
 * JavaScript; looks like a line-wrapping artifact - confirm against the real
 * build output.
 * NOTE(review): the rendered llama.cpp launch snippet puts # comments after
 * the trailing backslashes, which would end the shell command at that line,
 * and the vLLM snippet calls llm.get_chat_template(), which does not appear
 * to exist on the vLLM LLM class - verify both, and fix them in the
 * documentation source rather than in this generated file.
 *
 * @param {*} A - context argument supplied by the caller; never read here.
 * @returns {object} fragment object with members c, l, m, p, i, o and d.
 */function Jt(A){let y,r,U="First, deploy TGI with advanced parameters:",J,T,c,m,I="Use the InferenceClient for flexible text generation:",s,M,Q,j,V="Or use the OpenAI client:",H,S,cl,N,x="For llama.cpp, you can set advanced parameters when launching the server:",D,E,rl,R,u="Use the InferenceClient:",$,el,q,tl,P="Or use the OpenAI client for generation with control over the sampling parameters:",bl,sl,Y,nl,K="You can also use llama.cpp’s native library for even more control:",dl,al,z,Ml,O="For advanced usage with vLLM, you can use the InferenceClient:",jl,_,Jl,pl,ll="You can also use the OpenAI client:",Cl,ol,F,Tl,wl="vLLM also provides a native Python interface with fine-grained control:",L,ml,il,Ul;return /* TGI: docker run launch command */T=new 
W({props:{code:"ZG9ja2VyJTIwcnVuJTIwLS1ncHVzJTIwYWxsJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1zaG0tc2l6ZSUyMDFnJTIwJTVDJTBBJTIwJTIwJTIwJTIwLXAlMjA4MDgwJTNBODAlMjAlNUMlMEElMjAlMjAlMjAlMjAtdiUyMH4lMkYuY2FjaGUlMkZodWdnaW5nZmFjZSUzQSUyRmRhdGElMjAlNUMlMEElMjAlMjAlMjAlMjBnaGNyLmlvJTJGaHVnZ2luZ2ZhY2UlMkZ0ZXh0LWdlbmVyYXRpb24taW5mZXJlbmNlJTNBbGF0ZXN0JTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1tb2RlbC1pZCUyMEh1Z2dpbmdGYWNlVEIlMkZTbW9sTE0yLTM2ME0tSW5zdHJ1Y3QlMjAlNUMlMEElMjAlMjAlMjAlMjAtLW1heC10b3RhbC10b2tlbnMlMjA0MDk2JTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1tYXgtaW5wdXQtbGVuZ3RoJTIwMzA3MiUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tbWF4LWJhdGNoLXRvdGFsLXRva2VucyUyMDgxOTIlMjAlNUMlMEElMjAlMjAlMjAlMjAtLXdhaXRpbmctc2VydmVkLXJhdGlvJTIwMS4y",highlighted:`docker run --gpus all \\
--shm-size 1g \\
-p 8080:80 \\
-v ~/.cache/huggingface:/data \\
ghcr.io/huggingface/text-generation-inference:latest \\
--model-id HuggingFaceTB/SmolLM2-360M-Instruct \\
--max-total-tokens 4096 \\
--max-input-length 3072 \\
--max-batch-total-tokens 8192 \\
--waiting-served-ratio 1.2`,wrap:!1}}),/* TGI: InferenceClient example */M=new W({props:{code:"ZnJvbSUyMGh1Z2dpbmdmYWNlX2h1YiUyMGltcG9ydCUyMEluZmVyZW5jZUNsaWVudCUwQSUwQWNsaWVudCUyMCUzRCUyMEluZmVyZW5jZUNsaWVudChtb2RlbCUzRCUyMmh0dHAlM0ElMkYlMkZsb2NhbGhvc3QlM0E4MDgwJTIyKSUwQSUwQSUyMyUyMEFkdmFuY2VkJTIwcGFyYW1ldGVycyUyMGV4YW1wbGUlMEFyZXNwb25zZSUyMCUzRCUyMGNsaWVudC5jaGF0X2NvbXBsZXRpb24oJTBBJTIwJTIwJTIwJTIwbWVzc2FnZXMlM0QlNUIlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlN0IlMjJyb2xlJTIyJTNBJTIwJTIyc3lzdGVtJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMllvdSUyMGFyZSUyMGElMjBjcmVhdGl2ZSUyMHN0b3J5dGVsbGVyLiUyMiU3RCUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCU3QiUyMnJvbGUlMjIlM0ElMjAlMjJ1c2VyJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMldyaXRlJTIwYSUyMGNyZWF0aXZlJTIwc3RvcnklMjIlN0QlMkMlMEElMjAlMjAlMjAlMjAlNUQlMkMlMEElMjAlMjAlMjAlMjB0ZW1wZXJhdHVyZSUzRDAuOCUyQyUwQSUyMCUyMCUyMCUyMG1heF90b2tlbnMlM0QyMDAlMkMlMEElMjAlMjAlMjAlMjB0b3BfcCUzRDAuOTUlMkMlMEEpJTBBcHJpbnQocmVzcG9uc2UuY2hvaWNlcyU1QjAlNUQubWVzc2FnZS5jb250ZW50KSUwQSUwQSUyMyUyMFJhdyUyMHRleHQlMjBnZW5lcmF0aW9uJTBBcmVzcG9uc2UlMjAlM0QlMjBjbGllbnQudGV4dF9nZW5lcmF0aW9uKCUwQSUyMCUyMCUyMCUyMCUyMldyaXRlJTIwYSUyMGNyZWF0aXZlJTIwc3RvcnklMjBhYm91dCUyMHNwYWNlJTIwZXhwbG9yYXRpb24lMjIlMkMlMEElMjAlMjAlMjAlMjBtYXhfbmV3X3Rva2VucyUzRDIwMCUyQyUwQSUyMCUyMCUyMCUyMHRlbXBlcmF0dXJlJTNEMC44JTJDJTBBJTIwJTIwJTIwJTIwdG9wX3AlM0QwLjk1JTJDJTBBJTIwJTIwJTIwJTIwcmVwZXRpdGlvbl9wZW5hbHR5JTNEMS4xJTJDJTBBJTIwJTIwJTIwJTIwZG9fc2FtcGxlJTNEVHJ1ZSUyQyUwQSUyMCUyMCUyMCUyMGRldGFpbHMlM0RUcnVlJTJDJTBBKSUwQXByaW50KHJlc3BvbnNlLmdlbmVyYXRlZF90ZXh0KQ==",highlighted:`<span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient
client = InferenceClient(model=<span class="hljs-string">&quot;http://localhost:8080&quot;</span>)
<span class="hljs-comment"># Advanced parameters example</span>
response = client.chat_completion(
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a creative storyteller.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Write a creative story&quot;</span>},
],
temperature=<span class="hljs-number">0.8</span>,
max_tokens=<span class="hljs-number">200</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)
<span class="hljs-comment"># Raw text generation</span>
response = client.text_generation(
<span class="hljs-string">&quot;Write a creative story about space exploration&quot;</span>,
max_new_tokens=<span class="hljs-number">200</span>,
temperature=<span class="hljs-number">0.8</span>,
top_p=<span class="hljs-number">0.95</span>,
repetition_penalty=<span class="hljs-number">1.1</span>,
do_sample=<span class="hljs-literal">True</span>,
details=<span class="hljs-literal">True</span>,
)
<span class="hljs-built_in">print</span>(response.generated_text)`,wrap:!1}}),/* TGI: OpenAI client example */S=new W({props:{code:"ZnJvbSUyMG9wZW5haSUyMGltcG9ydCUyME9wZW5BSSUwQSUwQWNsaWVudCUyMCUzRCUyME9wZW5BSShiYXNlX3VybCUzRCUyMmh0dHAlM0ElMkYlMkZsb2NhbGhvc3QlM0E4MDgwJTJGdjElMjIlMkMlMjBhcGlfa2V5JTNEJTIybm90LW5lZWRlZCUyMiklMEElMEElMjMlMjBBZHZhbmNlZCUyMHBhcmFtZXRlcnMlMjBleGFtcGxlJTBBcmVzcG9uc2UlMjAlM0QlMjBjbGllbnQuY2hhdC5jb21wbGV0aW9ucy5jcmVhdGUoJTBBJTIwJTIwJTIwJTIwbW9kZWwlM0QlMjJIdWdnaW5nRmFjZVRCJTJGU21vbExNMi0zNjBNLUluc3RydWN0JTIyJTJDJTBBJTIwJTIwJTIwJTIwbWVzc2FnZXMlM0QlNUIlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlN0IlMjJyb2xlJTIyJTNBJTIwJTIyc3lzdGVtJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMllvdSUyMGFyZSUyMGElMjBjcmVhdGl2ZSUyMHN0b3J5dGVsbGVyLiUyMiU3RCUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCU3QiUyMnJvbGUlMjIlM0ElMjAlMjJ1c2VyJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMldyaXRlJTIwYSUyMGNyZWF0aXZlJTIwc3RvcnklMjIlN0QlMkMlMEElMjAlMjAlMjAlMjAlNUQlMkMlMEElMjAlMjAlMjAlMjB0ZW1wZXJhdHVyZSUzRDAuOCUyQyUyMCUyMCUyMyUyMEhpZ2hlciUyMGZvciUyMG1vcmUlMjBjcmVhdGl2aXR5JTBBKSUwQXByaW50KHJlc3BvbnNlLmNob2ljZXMlNUIwJTVELm1lc3NhZ2UuY29udGVudCk=",highlighted:`<span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI
client = OpenAI(base_url=<span class="hljs-string">&quot;http://localhost:8080/v1&quot;</span>, api_key=<span class="hljs-string">&quot;not-needed&quot;</span>)
<span class="hljs-comment"># Advanced parameters example</span>
response = client.chat.completions.create(
model=<span class="hljs-string">&quot;HuggingFaceTB/SmolLM2-360M-Instruct&quot;</span>,
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a creative storyteller.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Write a creative story&quot;</span>},
],
temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span>
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)`,wrap:!1}}),/* llama.cpp: server launch command */E=new W({props:{code:"LiUyRnNlcnZlciUyMCU1QyUwQSUyMCUyMCUyMCUyMC1tJTIwc21vbGxtMi0xLjdiLWluc3RydWN0LlE0X0tfTS5nZ3VmJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1ob3N0JTIwMC4wLjAuMCUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tcG9ydCUyMDgwODAlMjAlNUMlMEElMjAlMjAlMjAlMjAtYyUyMDQwOTYlMjAlNUMlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjMlMjBDb250ZXh0JTIwc2l6ZSUwQSUyMCUyMCUyMCUyMC0tdGhyZWFkcyUyMDglMjAlNUMlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjMlMjBDUFUlMjB0aHJlYWRzJTIwdG8lMjB1c2UlMEElMjAlMjAlMjAlMjAtLWJhdGNoLXNpemUlMjA1MTIlMjAlNUMlMjAlMjAlMjAlMjMlMjBCYXRjaCUyMHNpemUlMjBmb3IlMjBwcm9tcHQlMjBldmFsdWF0aW9uJTBBJTIwJTIwJTIwJTIwLS1uLWdwdS1sYXllcnMlMjAwJTIwJTIwJTIwJTIwJTIwJTIzJTIwR1BVJTIwbGF5ZXJzJTIwKDAlMjAlM0QlMjBDUFUlMjBvbmx5KQ==",highlighted:`./server \\
-m smollm2-1.7b-instruct.Q4_K_M.gguf \\
--host 0.0.0.0 \\
--port 8080 \\
-c 4096 \\ <span class="hljs-comment"># Context size</span>
--threads 8 \\ <span class="hljs-comment"># CPU threads to use</span>
--batch-size 512 \\ <span class="hljs-comment"># Batch size for prompt evaluation</span>
--n-gpu-layers 0 <span class="hljs-comment"># GPU layers (0 = CPU only)</span>`,wrap:!1}}),/* llama.cpp: InferenceClient example */el=new W({props:{code:"ZnJvbSUyMGh1Z2dpbmdmYWNlX2h1YiUyMGltcG9ydCUyMEluZmVyZW5jZUNsaWVudCUwQSUwQWNsaWVudCUyMCUzRCUyMEluZmVyZW5jZUNsaWVudChtb2RlbCUzRCUyMmh0dHAlM0ElMkYlMkZsb2NhbGhvc3QlM0E4MDgwJTJGdjElMjIlMkMlMjB0b2tlbiUzRCUyMnNrLW5vLWtleS1yZXF1aXJlZCUyMiklMEElMEElMjMlMjBBZHZhbmNlZCUyMHBhcmFtZXRlcnMlMjBleGFtcGxlJTBBcmVzcG9uc2UlMjAlM0QlMjBjbGllbnQuY2hhdF9jb21wbGV0aW9uKCUwQSUyMCUyMCUyMCUyMG1lc3NhZ2VzJTNEJTVCJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMnN5c3RlbSUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJZb3UlMjBhcmUlMjBhJTIwY3JlYXRpdmUlMjBzdG9yeXRlbGxlci4lMjIlN0QlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlN0IlMjJyb2xlJTIyJTNBJTIwJTIydXNlciUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJXcml0ZSUyMGElMjBjcmVhdGl2ZSUyMHN0b3J5JTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTVEJTJDJTBBJTIwJTIwJTIwJTIwdGVtcGVyYXR1cmUlM0QwLjglMkMlMEElMjAlMjAlMjAlMjBtYXhfdG9rZW5zJTNEMjAwJTJDJTBBJTIwJTIwJTIwJTIwdG9wX3AlM0QwLjk1JTJDJTBBKSUwQXByaW50KHJlc3BvbnNlLmNob2ljZXMlNUIwJTVELm1lc3NhZ2UuY29udGVudCklMEElMEElMjMlMjBGb3IlMjBkaXJlY3QlMjB0ZXh0JTIwZ2VuZXJhdGlvbiUwQXJlc3BvbnNlJTIwJTNEJTIwY2xpZW50LnRleHRfZ2VuZXJhdGlvbiglMEElMjAlMjAlMjAlMjAlMjJXcml0ZSUyMGElMjBjcmVhdGl2ZSUyMHN0b3J5JTIwYWJvdXQlMjBzcGFjZSUyMGV4cGxvcmF0aW9uJTIyJTJDJTBBJTIwJTIwJTIwJTIwbWF4X25ld190b2tlbnMlM0QyMDAlMkMlMEElMjAlMjAlMjAlMjB0ZW1wZXJhdHVyZSUzRDAuOCUyQyUwQSUyMCUyMCUyMCUyMHRvcF9wJTNEMC45NSUyQyUwQSUyMCUyMCUyMCUyMHJlcGV0aXRpb25fcGVuYWx0eSUzRDEuMSUyQyUwQSUyMCUyMCUyMCUyMGRldGFpbHMlM0RUcnVlJTJDJTBBKSUwQXByaW50KHJlc3BvbnNlLmdlbmVyYXRlZF90ZXh0KQ==",highlighted:`<span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient
client = InferenceClient(model=<span class="hljs-string">&quot;http://localhost:8080/v1&quot;</span>, token=<span class="hljs-string">&quot;sk-no-key-required&quot;</span>)
<span class="hljs-comment"># Advanced parameters example</span>
response = client.chat_completion(
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a creative storyteller.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Write a creative story&quot;</span>},
],
temperature=<span class="hljs-number">0.8</span>,
max_tokens=<span class="hljs-number">200</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)
<span class="hljs-comment"># For direct text generation</span>
response = client.text_generation(
<span class="hljs-string">&quot;Write a creative story about space exploration&quot;</span>,
max_new_tokens=<span class="hljs-number">200</span>,
temperature=<span class="hljs-number">0.8</span>,
top_p=<span class="hljs-number">0.95</span>,
repetition_penalty=<span class="hljs-number">1.1</span>,
details=<span class="hljs-literal">True</span>,
)
<span class="hljs-built_in">print</span>(response.generated_text)`,wrap:!1}}),/* llama.cpp: OpenAI client example */sl=new W({props:{code:"ZnJvbSUyMG9wZW5haSUyMGltcG9ydCUyME9wZW5BSSUwQSUwQWNsaWVudCUyMCUzRCUyME9wZW5BSShiYXNlX3VybCUzRCUyMmh0dHAlM0ElMkYlMkZsb2NhbGhvc3QlM0E4MDgwJTJGdjElMjIlMkMlMjBhcGlfa2V5JTNEJTIyc2stbm8ta2V5LXJlcXVpcmVkJTIyKSUwQSUwQSUyMyUyMEFkdmFuY2VkJTIwcGFyYW1ldGVycyUyMGV4YW1wbGUlMEFyZXNwb25zZSUyMCUzRCUyMGNsaWVudC5jaGF0LmNvbXBsZXRpb25zLmNyZWF0ZSglMEElMjAlMjAlMjAlMjBtb2RlbCUzRCUyMnNtb2xsbTItMS43Yi1pbnN0cnVjdCUyMiUyQyUwQSUyMCUyMCUyMCUyMG1lc3NhZ2VzJTNEJTVCJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMnN5c3RlbSUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJZb3UlMjBhcmUlMjBhJTIwY3JlYXRpdmUlMjBzdG9yeXRlbGxlci4lMjIlN0QlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlN0IlMjJyb2xlJTIyJTNBJTIwJTIydXNlciUyMiUyQyUyMCUyMmNvbnRlbnQlMjIlM0ElMjAlMjJXcml0ZSUyMGElMjBjcmVhdGl2ZSUyMHN0b3J5JTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTVEJTJDJTBBJTIwJTIwJTIwJTIwdGVtcGVyYXR1cmUlM0QwLjglMkMlMjAlMjAlMjMlMjBIaWdoZXIlMjBmb3IlMjBtb3JlJTIwY3JlYXRpdml0eSUwQSUyMCUyMCUyMCUyMHRvcF9wJTNEMC45NSUyQyUyMCUyMCUyMyUyME51Y2xldXMlMjBzYW1wbGluZyUyMHByb2JhYmlsaXR5JTBBJTIwJTIwJTIwJTIwZnJlcXVlbmN5X3BlbmFsdHklM0QwLjUlMkMlMjAlMjAlMjMlMjBSZWR1Y2UlMjByZXBldGl0aW9uJTIwb2YlMjBmcmVxdWVudCUyMHRva2VucyUwQSUyMCUyMCUyMCUyMHByZXNlbmNlX3BlbmFsdHklM0QwLjUlMkMlMjAlMjAlMjMlMjBSZWR1Y2UlMjByZXBldGl0aW9uJTIwYnklMjBwZW5hbGl6aW5nJTIwdG9rZW5zJTIwYWxyZWFkeSUyMHByZXNlbnQlMEElMjAlMjAlMjAlMjBtYXhfdG9rZW5zJTNEMjAwJTJDJTIwJTIwJTIzJTIwTWF4aW11bSUyMGdlbmVyYXRpb24lMjBsZW5ndGglMEEpJTBBcHJpbnQocmVzcG9uc2UuY2hvaWNlcyU1QjAlNUQubWVzc2FnZS5jb250ZW50KQ==",highlighted:`<span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI
client = OpenAI(base_url=<span class="hljs-string">&quot;http://localhost:8080/v1&quot;</span>, api_key=<span class="hljs-string">&quot;sk-no-key-required&quot;</span>)
<span class="hljs-comment"># Advanced parameters example</span>
response = client.chat.completions.create(
model=<span class="hljs-string">&quot;smollm2-1.7b-instruct&quot;</span>,
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a creative storyteller.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Write a creative story&quot;</span>},
],
temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span>
top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Nucleus sampling probability</span>
frequency_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># Reduce repetition of frequent tokens</span>
presence_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># Reduce repetition by penalizing tokens already present</span>
max_tokens=<span class="hljs-number">200</span>, <span class="hljs-comment"># Maximum generation length</span>
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)`,wrap:!1}}),/* llama.cpp: llama-cpp-python example */al=new W({props:{code:"JTIzJTIwVXNpbmclMjBsbGFtYS1jcHAtcHl0aG9uJTIwcGFja2FnZSUyMGZvciUyMGRpcmVjdCUyMG1vZGVsJTIwYWNjZXNzJTBBZnJvbSUyMGxsYW1hX2NwcCUyMGltcG9ydCUyMExsYW1hJTBBJTBBJTIzJTIwTG9hZCUyMHRoZSUyMG1vZGVsJTBBbGxtJTIwJTNEJTIwTGxhbWEoJTBBJTIwJTIwJTIwJTIwbW9kZWxfcGF0aCUzRCUyMnNtb2xsbTItMS43Yi1pbnN0cnVjdC5RNF9LX00uZ2d1ZiUyMiUyQyUwQSUyMCUyMCUyMCUyMG5fY3R4JTNENDA5NiUyQyUyMCUyMCUyMyUyMENvbnRleHQlMjB3aW5kb3clMjBzaXplJTBBJTIwJTIwJTIwJTIwbl90aHJlYWRzJTNEOCUyQyUyMCUyMCUyMyUyMENQVSUyMHRocmVhZHMlMEElMjAlMjAlMjAlMjBuX2dwdV9sYXllcnMlM0QwJTJDJTIwJTIwJTIzJTIwR1BVJTIwbGF5ZXJzJTIwKDAlMjAlM0QlMjBDUFUlMjBvbmx5KSUwQSklMEElMEElMjMlMjBGb3JtYXQlMjBwcm9tcHQlMjBhY2NvcmRpbmclMjB0byUyMHRoZSUyMG1vZGVsJ3MlMjBleHBlY3RlZCUyMGZvcm1hdCUwQXByb21wdCUyMCUzRCUyMCUyMiUyMiUyMiUzQyU3Q2ltX3N0YXJ0JTdDJTNFc3lzdGVtJTBBWW91JTIwYXJlJTIwYSUyMGNyZWF0aXZlJTIwc3Rvcnl0ZWxsZXIuJTBBJTNDJTdDaW1fZW5kJTdDJTNFJTBBJTNDJTdDaW1fc3RhcnQlN0MlM0V1c2VyJTBBV3JpdGUlMjBhJTIwY3JlYXRpdmUlMjBzdG9yeSUwQSUzQyU3Q2ltX2VuZCU3QyUzRSUwQSUzQyU3Q2ltX3N0YXJ0JTdDJTNFYXNzaXN0YW50JTBBJTIyJTIyJTIyJTBBJTBBJTIzJTIwR2VuZXJhdGUlMjByZXNwb25zZSUyMHdpdGglMjBwcmVjaXNlJTIwcGFyYW1ldGVyJTIwY29udHJvbCUwQW91dHB1dCUyMCUzRCUyMGxsbSglMEElMjAlMjAlMjAlMjBwcm9tcHQlMkMlMEElMjAlMjAlMjAlMjBtYXhfdG9rZW5zJTNEMjAwJTJDJTBBJTIwJTIwJTIwJTIwdGVtcGVyYXR1cmUlM0QwLjglMkMlMEElMjAlMjAlMjAlMjB0b3BfcCUzRDAuOTUlMkMlMEElMjAlMjAlMjAlMjBmcmVxdWVuY3lfcGVuYWx0eSUzRDAuNSUyQyUwQSUyMCUyMCUyMCUyMHByZXNlbmNlX3BlbmFsdHklM0QwLjUlMkMlMEElMjAlMjAlMjAlMjBzdG9wJTNEJTVCJTIyJTNDJTdDaW1fZW5kJTdDJTNFJTIyJTVEJTJDJTBBKSUwQSUwQXByaW50KG91dHB1dCU1QiUyMmNob2ljZXMlMjIlNUQlNUIwJTVEJTVCJTIydGV4dCUyMiU1RCk=",highlighted:`<span class="hljs-comment"># Using llama-cpp-python package for direct model access</span>
<span class="hljs-keyword">from</span> llama_cpp <span class="hljs-keyword">import</span> Llama
<span class="hljs-comment"># Load the model</span>
llm = Llama(
model_path=<span class="hljs-string">&quot;smollm2-1.7b-instruct.Q4_K_M.gguf&quot;</span>,
n_ctx=<span class="hljs-number">4096</span>, <span class="hljs-comment"># Context window size</span>
n_threads=<span class="hljs-number">8</span>, <span class="hljs-comment"># CPU threads</span>
n_gpu_layers=<span class="hljs-number">0</span>, <span class="hljs-comment"># GPU layers (0 = CPU only)</span>
)
<span class="hljs-comment"># Format prompt according to the model&#x27;s expected format</span>
prompt = <span class="hljs-string">&quot;&quot;&quot;&lt;|im_start|&gt;system
You are a creative storyteller.
&lt;|im_end|&gt;
&lt;|im_start|&gt;user
Write a creative story
&lt;|im_end|&gt;
&lt;|im_start|&gt;assistant
&quot;&quot;&quot;</span>
<span class="hljs-comment"># Generate response with precise parameter control</span>
output = llm(
prompt,
max_tokens=<span class="hljs-number">200</span>,
temperature=<span class="hljs-number">0.8</span>,
top_p=<span class="hljs-number">0.95</span>,
frequency_penalty=<span class="hljs-number">0.5</span>,
presence_penalty=<span class="hljs-number">0.5</span>,
stop=[<span class="hljs-string">&quot;&lt;|im_end|&gt;&quot;</span>],
)
<span class="hljs-built_in">print</span>(output[<span class="hljs-string">&quot;choices&quot;</span>][<span class="hljs-number">0</span>][<span class="hljs-string">&quot;text&quot;</span>])`,wrap:!1}}),/* vLLM: InferenceClient example */_=new W({props:{code:"ZnJvbSUyMGh1Z2dpbmdmYWNlX2h1YiUyMGltcG9ydCUyMEluZmVyZW5jZUNsaWVudCUwQSUwQWNsaWVudCUyMCUzRCUyMEluZmVyZW5jZUNsaWVudChtb2RlbCUzRCUyMmh0dHAlM0ElMkYlMkZsb2NhbGhvc3QlM0E4MDAwJTJGdjElMjIpJTBBJTBBJTIzJTIwQWR2YW5jZWQlMjBwYXJhbWV0ZXJzJTIwZXhhbXBsZSUwQXJlc3BvbnNlJTIwJTNEJTIwY2xpZW50LmNoYXRfY29tcGxldGlvbiglMEElMjAlMjAlMjAlMjBtZXNzYWdlcyUzRCU1QiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCU3QiUyMnJvbGUlMjIlM0ElMjAlMjJzeXN0ZW0lMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIyWW91JTIwYXJlJTIwYSUyMGNyZWF0aXZlJTIwc3Rvcnl0ZWxsZXIuJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMnVzZXIlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIyV3JpdGUlMjBhJTIwY3JlYXRpdmUlMjBzdG9yeSUyMiU3RCUyQyUwQSUyMCUyMCUyMCUyMCU1RCUyQyUwQSUyMCUyMCUyMCUyMHRlbXBlcmF0dXJlJTNEMC44JTJDJTBBJTIwJTIwJTIwJTIwbWF4X3Rva2VucyUzRDIwMCUyQyUwQSUyMCUyMCUyMCUyMHRvcF9wJTNEMC45NSUyQyUwQSklMEFwcmludChyZXNwb25zZS5jaG9pY2VzJTVCMCU1RC5tZXNzYWdlLmNvbnRlbnQpJTBBJTBBJTIzJTIwRm9yJTIwZGlyZWN0JTIwdGV4dCUyMGdlbmVyYXRpb24lMEFyZXNwb25zZSUyMCUzRCUyMGNsaWVudC50ZXh0X2dlbmVyYXRpb24oJTBBJTIwJTIwJTIwJTIwJTIyV3JpdGUlMjBhJTIwY3JlYXRpdmUlMjBzdG9yeSUyMGFib3V0JTIwc3BhY2UlMjBleHBsb3JhdGlvbiUyMiUyQyUwQSUyMCUyMCUyMCUyMG1heF9uZXdfdG9rZW5zJTNEMjAwJTJDJTBBJTIwJTIwJTIwJTIwdGVtcGVyYXR1cmUlM0QwLjglMkMlMEElMjAlMjAlMjAlMjB0b3BfcCUzRDAuOTUlMkMlMEElMjAlMjAlMjAlMjBkZXRhaWxzJTNEVHJ1ZSUyQyUwQSklMEFwcmludChyZXNwb25zZS5nZW5lcmF0ZWRfdGV4dCk=",highlighted:`<span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient
client = InferenceClient(model=<span class="hljs-string">&quot;http://localhost:8000/v1&quot;</span>)
<span class="hljs-comment"># Advanced parameters example</span>
response = client.chat_completion(
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a creative storyteller.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Write a creative story&quot;</span>},
],
temperature=<span class="hljs-number">0.8</span>,
max_tokens=<span class="hljs-number">200</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)
<span class="hljs-comment"># For direct text generation</span>
response = client.text_generation(
<span class="hljs-string">&quot;Write a creative story about space exploration&quot;</span>,
max_new_tokens=<span class="hljs-number">200</span>,
temperature=<span class="hljs-number">0.8</span>,
top_p=<span class="hljs-number">0.95</span>,
details=<span class="hljs-literal">True</span>,
)
<span class="hljs-built_in">print</span>(response.generated_text)`,wrap:!1}}),/* vLLM: OpenAI client example */ol=new W({props:{code:"ZnJvbSUyMG9wZW5haSUyMGltcG9ydCUyME9wZW5BSSUwQSUwQWNsaWVudCUyMCUzRCUyME9wZW5BSShiYXNlX3VybCUzRCUyMmh0dHAlM0ElMkYlMkZsb2NhbGhvc3QlM0E4MDAwJTJGdjElMjIlMkMlMjBhcGlfa2V5JTNEJTIybm90LW5lZWRlZCUyMiklMEElMEElMjMlMjBBZHZhbmNlZCUyMHBhcmFtZXRlcnMlMjBleGFtcGxlJTBBcmVzcG9uc2UlMjAlM0QlMjBjbGllbnQuY2hhdC5jb21wbGV0aW9ucy5jcmVhdGUoJTBBJTIwJTIwJTIwJTIwbW9kZWwlM0QlMjJIdWdnaW5nRmFjZVRCJTJGU21vbExNMi0zNjBNLUluc3RydWN0JTIyJTJDJTBBJTIwJTIwJTIwJTIwbWVzc2FnZXMlM0QlNUIlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlN0IlMjJyb2xlJTIyJTNBJTIwJTIyc3lzdGVtJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMllvdSUyMGFyZSUyMGElMjBjcmVhdGl2ZSUyMHN0b3J5dGVsbGVyLiUyMiU3RCUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCU3QiUyMnJvbGUlMjIlM0ElMjAlMjJ1c2VyJTIyJTJDJTIwJTIyY29udGVudCUyMiUzQSUyMCUyMldyaXRlJTIwYSUyMGNyZWF0aXZlJTIwc3RvcnklMjIlN0QlMkMlMEElMjAlMjAlMjAlMjAlNUQlMkMlMEElMjAlMjAlMjAlMjB0ZW1wZXJhdHVyZSUzRDAuOCUyQyUwQSUyMCUyMCUyMCUyMHRvcF9wJTNEMC45NSUyQyUwQSUyMCUyMCUyMCUyMG1heF90b2tlbnMlM0QyMDAlMkMlMEEpJTBBcHJpbnQocmVzcG9uc2UuY2hvaWNlcyU1QjAlNUQubWVzc2FnZS5jb250ZW50KQ==",highlighted:`<span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI
client = OpenAI(base_url=<span class="hljs-string">&quot;http://localhost:8000/v1&quot;</span>, api_key=<span class="hljs-string">&quot;not-needed&quot;</span>)
<span class="hljs-comment"># Advanced parameters example</span>
response = client.chat.completions.create(
model=<span class="hljs-string">&quot;HuggingFaceTB/SmolLM2-360M-Instruct&quot;</span>,
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a creative storyteller.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Write a creative story&quot;</span>},
],
temperature=<span class="hljs-number">0.8</span>,
top_p=<span class="hljs-number">0.95</span>,
max_tokens=<span class="hljs-number">200</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)`,wrap:!1}}),/* vLLM: native Python API example */ml=new W({props:{code:"ZnJvbSUyMHZsbG0lMjBpbXBvcnQlMjBMTE0lMkMlMjBTYW1wbGluZ1BhcmFtcyUwQSUwQSUyMyUyMEluaXRpYWxpemUlMjB0aGUlMjBtb2RlbCUyMHdpdGglMjBhZHZhbmNlZCUyMHBhcmFtZXRlcnMlMEFsbG0lMjAlM0QlMjBMTE0oJTBBJTIwJTIwJTIwJTIwbW9kZWwlM0QlMjJIdWdnaW5nRmFjZVRCJTJGU21vbExNMi0zNjBNLUluc3RydWN0JTIyJTJDJTBBJTIwJTIwJTIwJTIwZ3B1X21lbW9yeV91dGlsaXphdGlvbiUzRDAuODUlMkMlMEElMjAlMjAlMjAlMjBtYXhfbnVtX2JhdGNoZWRfdG9rZW5zJTNEODE5MiUyQyUwQSUyMCUyMCUyMCUyMG1heF9udW1fc2VxcyUzRDI1NiUyQyUwQSUyMCUyMCUyMCUyMGJsb2NrX3NpemUlM0QxNiUyQyUwQSklMEElMEElMjMlMjBDb25maWd1cmUlMjBzYW1wbGluZyUyMHBhcmFtZXRlcnMlMEFzYW1wbGluZ19wYXJhbXMlMjAlM0QlMjBTYW1wbGluZ1BhcmFtcyglMEElMjAlMjAlMjAlMjB0ZW1wZXJhdHVyZSUzRDAuOCUyQyUyMCUyMCUyMyUyMEhpZ2hlciUyMGZvciUyMG1vcmUlMjBjcmVhdGl2aXR5JTBBJTIwJTIwJTIwJTIwdG9wX3AlM0QwLjk1JTJDJTIwJTIwJTIzJTIwQ29uc2lkZXIlMjB0b3AlMjA5NSUyNSUyMHByb2JhYmlsaXR5JTIwbWFzcyUwQSUyMCUyMCUyMCUyMG1heF90b2tlbnMlM0QxMDAlMkMlMjAlMjAlMjMlMjBNYXhpbXVtJTIwbGVuZ3RoJTBBJTIwJTIwJTIwJTIwcHJlc2VuY2VfcGVuYWx0eSUzRDEuMSUyQyUyMCUyMCUyMyUyMFJlZHVjZSUyMHJlcGV0aXRpb24lMEElMjAlMjAlMjAlMjBmcmVxdWVuY3lfcGVuYWx0eSUzRDEuMSUyQyUyMCUyMCUyMyUyMFJlZHVjZSUyMHJlcGV0aXRpb24lMEElMjAlMjAlMjAlMjBzdG9wJTNEJTVCJTIyJTVDbiU1Q24lMjIlMkMlMjAlMjIlMjMlMjMlMjMlMjIlNUQlMkMlMjAlMjAlMjMlMjBTdG9wJTIwc2VxdWVuY2VzJTBBKSUwQSUwQSUyMyUyMEdlbmVyYXRlJTIwdGV4dCUwQXByb21wdCUyMCUzRCUyMCUyMldyaXRlJTIwYSUyMGNyZWF0aXZlJTIwc3RvcnklMjIlMEFvdXRwdXRzJTIwJTNEJTIwbGxtLmdlbmVyYXRlKHByb21wdCUyQyUyMHNhbXBsaW5nX3BhcmFtcyklMEFwcmludChvdXRwdXRzJTVCMCU1RC5vdXRwdXRzJTVCMCU1RC50ZXh0KSUwQSUwQSUyMyUyMEZvciUyMGNoYXQtc3R5bGUlMjBpbnRlcmFjdGlvbnMlMEFjaGF0X3Byb21wdCUyMCUzRCUyMCU1QiUwQSUyMCUyMCUyMCUyMCU3QiUyMnJvbGUlMjIlM0ElMjAlMjJzeXN0ZW0lMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIyWW91JTIwYXJlJTIwYSUyMGNyZWF0aXZlJTIwc3Rvcnl0ZWxsZXIuJTIyJTdEJTJDJTBBJTIwJTIwJTIwJTIwJTdCJTIycm9sZSUyMiUzQSUyMCUyMnVzZXIlMjIlMkMlMjAlMjJjb250ZW50JTIyJTNBJTIwJTIyV3JpdGUlMjBhJTIwY3JlYXRpdmUlMj
BzdG9yeSUyMiU3RCUyQyUwQSU1RCUwQWZvcm1hdHRlZF9wcm9tcHQlMjAlM0QlMjBsbG0uZ2V0X2NoYXRfdGVtcGxhdGUoKShjaGF0X3Byb21wdCklMjAlMjAlMjMlMjBVc2VzJTIwbW9kZWwncyUyMGNoYXQlMjB0ZW1wbGF0ZSUwQW91dHB1dHMlMjAlM0QlMjBsbG0uZ2VuZXJhdGUoZm9ybWF0dGVkX3Byb21wdCUyQyUyMHNhbXBsaW5nX3BhcmFtcyklMEFwcmludChvdXRwdXRzJTVCMCU1RC5vdXRwdXRzJTVCMCU1RC50ZXh0KQ==",highlighted:`<span class="hljs-keyword">from</span> vllm <span class="hljs-keyword">import</span> LLM, SamplingParams
<span class="hljs-comment"># Initialize the model with advanced parameters</span>
llm = LLM(
model=<span class="hljs-string">&quot;HuggingFaceTB/SmolLM2-360M-Instruct&quot;</span>,
gpu_memory_utilization=<span class="hljs-number">0.85</span>,
max_num_batched_tokens=<span class="hljs-number">8192</span>,
max_num_seqs=<span class="hljs-number">256</span>,
block_size=<span class="hljs-number">16</span>,
)
<span class="hljs-comment"># Configure sampling parameters</span>
sampling_params = SamplingParams(
temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span>
top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Consider top 95% probability mass</span>
max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Maximum length</span>
presence_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Reduce repetition</span>
frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Reduce repetition</span>
stop=[<span class="hljs-string">&quot;\\n\\n&quot;</span>, <span class="hljs-string">&quot;###&quot;</span>], <span class="hljs-comment"># Stop sequences</span>
)
<span class="hljs-comment"># Generate text</span>
prompt = <span class="hljs-string">&quot;Write a creative story&quot;</span>
outputs = llm.generate(prompt, sampling_params)
<span class="hljs-built_in">print</span>(outputs[<span class="hljs-number">0</span>].outputs[<span class="hljs-number">0</span>].text)
<span class="hljs-comment"># For chat-style interactions</span>
chat_prompt = [
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a creative storyteller.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Write a creative story&quot;</span>},
]
formatted_prompt = llm.get_chat_template()(chat_prompt) <span class="hljs-comment"># Uses model&#x27;s chat template</span>
outputs = llm.generate(formatted_prompt, sampling_params)
<span class="hljs-built_in">print</span>(outputs[<span class="hljs-number">0</span>].outputs[<span class="hljs-number">0</span>].text)`,wrap:!1}}),{c(){y=X(`<hfoption value="tgi" label="TGI">
`),r=w("p"),r.textContent=U,J=o(),g(T.$$.fragment),c=o(),m=w("p"),m.textContent=I,s=o(),g(M.$$.fragment),Q=o(),j=w("p"),j.textContent=V,H=o(),g(S.$$.fragment),cl=X(`
</hfoption>
<hfoption value="llama.cpp" label="llama.cpp">
`),N=w("p"),N.textContent=x,D=o(),g(E.$$.fragment),rl=o(),R=w("p"),R.textContent=u,$=o(),g(el.$$.fragment),q=o(),tl=w("p"),tl.textContent=P,bl=o(),g(sl.$$.fragment),Y=o(),nl=w("p"),nl.textContent=K,dl=o(),g(al.$$.fragment),z=X(`
</hfoption>
<hfoption value="vllm" label="vLLM">
`),Ml=w("p"),Ml.textContent=O,jl=o(),g(_.$$.fragment),Jl=o(),pl=w("p"),pl.textContent=ll,Cl=o(),g(ol.$$.fragment),F=o(),Tl=w("p"),Tl.textContent=wl,L=o(),g(ml.$$.fragment),il=X(`
</hfoption>`)},l(a){y=k(a,`<hfoption value="tgi" label="TGI">
`),r=h(a,"P",{"data-svelte-h":!0}),f(r)!=="svelte-p4nmid"&&(r.textContent=U),J=i(a),B(T.$$.fragment,a),c=i(a),m=h(a,"P",{"data-svelte-h":!0}),f(m)!=="svelte-8cq2xt"&&(m.textContent=I),s=i(a),B(M.$$.fragment,a),Q=i(a),j=h(a,"P",{"data-svelte-h":!0}),f(j)!=="svelte-gwduza"&&(j.textContent=V),H=i(a),B(S.$$.fragment,a),cl=k(a,`
</hfoption>
<hfoption value="llama.cpp" label="llama.cpp">
`),N=h(a,"P",{"data-svelte-h":!0}),f(N)!=="svelte-b43ol"&&(N.textContent=x),D=i(a),B(E.$$.fragment,a),rl=i(a),R=h(a,"P",{"data-svelte-h":!0}),f(R)!=="svelte-u8i4ra"&&(R.textContent=u),$=i(a),B(el.$$.fragment,a),q=i(a),tl=h(a,"P",{"data-svelte-h":!0}),f(tl)!=="svelte-1jxmijw"&&(tl.textContent=P),bl=i(a),B(sl.$$.fragment,a),Y=i(a),nl=h(a,"P",{"data-svelte-h":!0}),f(nl)!=="svelte-1yadfem"&&(nl.textContent=K),dl=i(a),B(al.$$.fragment,a),z=k(a,`
</hfoption>
<hfoption value="vllm" label="vLLM">
`),Ml=h(a,"P",{"data-svelte-h":!0}),f(Ml)!=="svelte-jbmhsw"&&(Ml.textContent=O),jl=i(a),B(_.$$.fragment,a),Jl=i(a),pl=h(a,"P",{"data-svelte-h":!0}),f(pl)!=="svelte-m9xqnx"&&(pl.textContent=ll),Cl=i(a),B(ol.$$.fragment,a),F=i(a),Tl=h(a,"P",{"data-svelte-h":!0}),f(Tl)!=="svelte-1xs95kf"&&(Tl.textContent=wl),L=i(a),B(ml.$$.fragment,a),il=k(a,`
</hfoption>`)},m(a,b){t(a,y,b),t(a,r,b),t(a,J,b),G(T,a,b),t(a,c,b),t(a,m,b),t(a,s,b),G(M,a,b),t(a,Q,b),t(a,j,b),t(a,H,b),G(S,a,b),t(a,cl,b),t(a,N,b),t(a,D,b),G(E,a,b),t(a,rl,b),t(a,R,b),t(a,$,b),G(el,a,b),t(a,q,b),t(a,tl,b),t(a,bl,b),G(sl,a,b),t(a,Y,b),t(a,nl,b),t(a,dl,b),G(al,a,b),t(a,z,b),t(a,Ml,b),t(a,jl,b),G(_,a,b),t(a,Jl,b),t(a,pl,b),t(a,Cl,b),G(ol,a,b),t(a,F,b),t(a,Tl,b),t(a,L,b),G(ml,a,b),t(a,il,b),Ul=!0},p:Gl,i(a){Ul||(d(T.$$.fragment,a),d(M.$$.fragment,a),d(S.$$.fragment,a),d(E.$$.fragment,a),d(el.$$.fragment,a),d(sl.$$.fragment,a),d(al.$$.fragment,a),d(_.$$.fragment,a),d(ol.$$.fragment,a),d(ml.$$.fragment,a),Ul=!0)},o(a){C(T.$$.fragment,a),C(M.$$.fragment,a),C(S.$$.fragment,a),C(E.$$.fragment,a),C(el.$$.fragment,a),C(sl.$$.fragment,a),C(al.$$.fragment,a),C(_.$$.fragment,a),C(ol.$$.fragment,a),C(ml.$$.fragment,a),Ul=!1},d(a){a&&(e(y),e(r),e(J),e(c),e(m),e(s),e(Q),e(j),e(H),e(cl),e(N),e(D),e(rl),e(R),e($),e(q),e(tl),e(bl),e(Y),e(nl),e(dl),e(z),e(Ml),e(jl),e(Jl),e(pl),e(Cl),e(F),e(Tl),e(L),e(il)),Z(T,a),Z(M,a),Z(S,a),Z(E,a),Z(el,a),Z(sl,a),Z(al,a),Z(_,a),Z(ol,a),Z(ml,a)}}}function mt(A){let y,r,U,J,T,c,m,I;return r=new W({props:{code:"Y2xpZW50LmdlbmVyYXRlKCUwQSUyMCUyMCUyMCUyMCUyMldyaXRlJTIwYSUyMGNyZWF0aXZlJTIwc3RvcnklMjIlMkMlMEElMjAlMjAlMjAlMjB0ZW1wZXJhdHVyZSUzRDAuOCUyQyUyMCUyMCUyMyUyMEhpZ2hlciUyMGZvciUyMG1vcmUlMjBjcmVhdGl2aXR5JTBBJTIwJTIwJTIwJTIwdG9wX3AlM0QwLjk1JTJDJTIwJTIwJTIzJTIwQ29uc2lkZXIlMjB0b3AlMjA5NSUyNSUyMHByb2JhYmlsaXR5JTIwbWFzcyUwQSUyMCUyMCUyMCUyMHRvcF9rJTNENTAlMkMlMjAlMjAlMjMlMjBDb25zaWRlciUyMHRvcCUyMDUwJTIwdG9rZW5zJTBBJTIwJTIwJTIwJTIwbWF4X25ld190b2tlbnMlM0QxMDAlMkMlMjAlMjAlMjMlMjBNYXhpbXVtJTIwbGVuZ3RoJTBBJTIwJTIwJTIwJTIwcmVwZXRpdGlvbl9wZW5hbHR5JTNEMS4xJTJDJTIwJTIwJTIzJTIwUmVkdWNlJTIwcmVwZXRpdGlvbiUwQSk=",highlighted:`client.generate(
<span class="hljs-string">&quot;Write a creative story&quot;</span>,
temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span>
top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Consider top 95% probability mass</span>
top_k=<span class="hljs-number">50</span>, <span class="hljs-comment"># Consider top 50 tokens</span>
max_new_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Maximum length</span>
repetition_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Reduce repetition</span>
)`,wrap:!1}}),J=new W({props:{code:"JTIzJTIwVmlhJTIwT3BlbkFJJTIwQVBJJTIwY29tcGF0aWJpbGl0eSUwQXJlc3BvbnNlJTIwJTNEJTIwY2xpZW50LmNvbXBsZXRpb25zLmNyZWF0ZSglMEElMjAlMjAlMjAlMjBtb2RlbCUzRCUyMnNtb2xsbTItMS43Yi1pbnN0cnVjdCUyMiUyQyUyMCUyMCUyMyUyME1vZGVsJTIwbmFtZSUyMChjYW4lMjBiZSUyMGFueSUyMHN0cmluZyUyMGZvciUyMGxsYW1hLmNwcCUyMHNlcnZlciklMEElMjAlMjAlMjAlMjBwcm9tcHQlM0QlMjJXcml0ZSUyMGElMjBjcmVhdGl2ZSUyMHN0b3J5JTIyJTJDJTBBJTIwJTIwJTIwJTIwdGVtcGVyYXR1cmUlM0QwLjglMkMlMjAlMjAlMjMlMjBIaWdoZXIlMjBmb3IlMjBtb3JlJTIwY3JlYXRpdml0eSUwQSUyMCUyMCUyMCUyMHRvcF9wJTNEMC45NSUyQyUyMCUyMCUyMyUyMENvbnNpZGVyJTIwdG9wJTIwOTUlMjUlMjBwcm9iYWJpbGl0eSUyMG1hc3MlMEElMjAlMjAlMjAlMjBmcmVxdWVuY3lfcGVuYWx0eSUzRDEuMSUyQyUyMCUyMCUyMyUyMFJlZHVjZSUyMHJlcGV0aXRpb24lMEElMjAlMjAlMjAlMjBwcmVzZW5jZV9wZW5hbHR5JTNEMC4xJTJDJTIwJTIwJTIzJTIwUmVkdWNlJTIwcmVwZXRpdGlvbiUwQSUyMCUyMCUyMCUyMG1heF90b2tlbnMlM0QxMDAlMkMlMjAlMjAlMjMlMjBNYXhpbXVtJTIwbGVuZ3RoJTBBKSUwQSUwQSUyMyUyMFZpYSUyMGxsYW1hLWNwcC1weXRob24lMjBkaXJlY3QlMjBhY2Nlc3MlMEFvdXRwdXQlMjAlM0QlMjBsbG0oJTBBJTIwJTIwJTIwJTIwJTIyV3JpdGUlMjBhJTIwY3JlYXRpdmUlMjBzdG9yeSUyMiUyQyUwQSUyMCUyMCUyMCUyMHRlbXBlcmF0dXJlJTNEMC44JTJDJTBBJTIwJTIwJTIwJTIwdG9wX3AlM0QwLjk1JTJDJTBBJTIwJTIwJTIwJTIwdG9wX2slM0Q1MCUyQyUwQSUyMCUyMCUyMCUyMG1heF90b2tlbnMlM0QxMDAlMkMlMEElMjAlMjAlMjAlMjByZXBlYXRfcGVuYWx0eSUzRDEuMSUyQyUwQSk=",highlighted:`<span class="hljs-comment"># Via OpenAI API compatibility</span>
response = client.completions.create(
model=<span class="hljs-string">&quot;smollm2-1.7b-instruct&quot;</span>, <span class="hljs-comment"># Model name (can be any string for llama.cpp server)</span>
prompt=<span class="hljs-string">&quot;Write a creative story&quot;</span>,
temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span>
top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Consider top 95% probability mass</span>
frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Reduce repetition</span>
presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Reduce repetition</span>
max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Maximum length</span>
)
<span class="hljs-comment"># Via llama-cpp-python direct access</span>
output = llm(
<span class="hljs-string">&quot;Write a creative story&quot;</span>,
temperature=<span class="hljs-number">0.8</span>,
top_p=<span class="hljs-number">0.95</span>,
top_k=<span class="hljs-number">50</span>,
max_tokens=<span class="hljs-number">100</span>,
repeat_penalty=<span class="hljs-number">1.1</span>,
)`,wrap:!1}}),c=new W({props:{code:"cGFyYW1zJTIwJTNEJTIwU2FtcGxpbmdQYXJhbXMoJTBBJTIwJTIwJTIwJTIwdGVtcGVyYXR1cmUlM0QwLjglMkMlMjAlMjAlMjMlMjBIaWdoZXIlMjBmb3IlMjBtb3JlJTIwY3JlYXRpdml0eSUwQSUyMCUyMCUyMCUyMHRvcF9wJTNEMC45NSUyQyUyMCUyMCUyMyUyMENvbnNpZGVyJTIwdG9wJTIwOTUlMjUlMjBwcm9iYWJpbGl0eSUyMG1hc3MlMEElMjAlMjAlMjAlMjB0b3BfayUzRDUwJTJDJTIwJTIwJTIzJTIwQ29uc2lkZXIlMjB0b3AlMjA1MCUyMHRva2VucyUwQSUyMCUyMCUyMCUyMG1heF90b2tlbnMlM0QxMDAlMkMlMjAlMjAlMjMlMjBNYXhpbXVtJTIwbGVuZ3RoJTBBJTIwJTIwJTIwJTIwcHJlc2VuY2VfcGVuYWx0eSUzRDAuMSUyQyUyMCUyMCUyMyUyMFJlZHVjZSUyMHJlcGV0aXRpb24lMEEpJTBBbGxtLmdlbmVyYXRlKCUyMldyaXRlJTIwYSUyMGNyZWF0aXZlJTIwc3RvcnklMjIlMkMlMjBzYW1wbGluZ19wYXJhbXMlM0RwYXJhbXMp",highlighted:`params = SamplingParams(
temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span>
top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Consider top 95% probability mass</span>
top_k=<span class="hljs-number">50</span>, <span class="hljs-comment"># Consider top 50 tokens</span>
max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Maximum length</span>
presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Reduce repetition</span>
)
llm.generate(<span class="hljs-string">&quot;Write a creative story&quot;</span>, sampling_params=params)`,wrap:!1}}),{c(){y=X(`<hfoption value="tgi" label="TGI">
`),g(r.$$.fragment),U=X(`
</hfoption>
<hfoption value="llama.cpp" label="llama.cpp">
`),g(J.$$.fragment),T=X(`
</hfoption>
<hfoption value="vllm" label="vLLM">
`),g(c.$$.fragment),m=X(`
</hfoption>`)},l(s){y=k(s,`<hfoption value="tgi" label="TGI">
`),B(r.$$.fragment,s),U=k(s,`
</hfoption>
<hfoption value="llama.cpp" label="llama.cpp">
`),B(J.$$.fragment,s),T=k(s,`
</hfoption>
<hfoption value="vllm" label="vLLM">
`),B(c.$$.fragment,s),m=k(s,`
</hfoption>`)},m(s,M){t(s,y,M),G(r,s,M),t(s,U,M),G(J,s,M),t(s,T,M),G(c,s,M),t(s,m,M),I=!0},p:Gl,i(s){I||(d(r.$$.fragment,s),d(J.$$.fragment,s),d(c.$$.fragment,s),I=!0)},o(s){C(r.$$.fragment,s),C(J.$$.fragment,s),C(c.$$.fragment,s),I=!1},d(s){s&&(e(y),e(U),e(T),e(m)),Z(r,s),Z(J,s),Z(c,s)}}}
/**
 * Svelte-compiled slot fragment for the Controlling Repetition option group
 * (jt passes it as the default slot of the Dl instance Al).
 *
 * Builds three W code-block components (r, J, c) with the repetition-penalty
 * examples and interleaves them with literal hfoption marker text for the
 * tgi, llama.cpp and vllm options.
 *
 * The parameter A (component context) is not read inside this fragment.
 * Returns the fragment hooks c, l, m, p, i, o and d; p is the imported Gl because
 * nothing in this fragment changes after creation, and the local flag I keeps i
 * from re-running until o has reset it.
 */
function Ut(A){let y,r,U,J,T,c,m,I;return r=new W({props:{code:"Y2xpZW50LmdlbmVyYXRlKCUwQSUyMCUyMCUyMCUyMCUyMldyaXRlJTIwYSUyMHZhcmllZCUyMHRleHQlMjIlMkMlMEElMjAlMjAlMjAlMjByZXBldGl0aW9uX3BlbmFsdHklM0QxLjElMkMlMjAlMjAlMjMlMjBQZW5hbGl6ZSUyMHJlcGVhdGVkJTIwdG9rZW5zJTBBJTIwJTIwJTIwJTIwbm9fcmVwZWF0X25ncmFtX3NpemUlM0QzJTJDJTIwJTIwJTIzJTIwUHJldmVudCUyMDMtZ3JhbSUyMHJlcGV0aXRpb24lMEEp",highlighted:`client.generate(
<span class="hljs-string">&quot;Write a varied text&quot;</span>,
repetition_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Penalize repeated tokens</span>
no_repeat_ngram_size=<span class="hljs-number">3</span>, <span class="hljs-comment"># Prevent 3-gram repetition</span>
)`,wrap:!1}}),J=new W({props:{code:"JTIzJTIwVmlhJTIwT3BlbkFJJTIwQVBJJTBBcmVzcG9uc2UlMjAlM0QlMjBjbGllbnQuY29tcGxldGlvbnMuY3JlYXRlKCUwQSUyMCUyMCUyMCUyMG1vZGVsJTNEJTIyc21vbGxtMi0xLjdiLWluc3RydWN0JTIyJTJDJTBBJTIwJTIwJTIwJTIwcHJvbXB0JTNEJTIyV3JpdGUlMjBhJTIwdmFyaWVkJTIwdGV4dCUyMiUyQyUwQSUyMCUyMCUyMCUyMGZyZXF1ZW5jeV9wZW5hbHR5JTNEMS4xJTJDJTIwJTIwJTIzJTIwUGVuYWxpemUlMjBmcmVxdWVudCUyMHRva2VucyUwQSUyMCUyMCUyMCUyMHByZXNlbmNlX3BlbmFsdHklM0QwLjglMkMlMjAlMjAlMjMlMjBQZW5hbGl6ZSUyMHRva2VucyUyMGFscmVhZHklMjBwcmVzZW50JTBBKSUwQSUwQSUyMyUyMFZpYSUyMGRpcmVjdCUyMGxpYnJhcnklMEFvdXRwdXQlMjAlM0QlMjBsbG0oJTBBJTIwJTIwJTIwJTIwJTIyV3JpdGUlMjBhJTIwdmFyaWVkJTIwdGV4dCUyMiUyQyUwQSUyMCUyMCUyMCUyMHJlcGVhdF9wZW5hbHR5JTNEMS4xJTJDJTIwJTIwJTIzJTIwUGVuYWxpemUlMjByZXBlYXRlZCUyMHRva2VucyUwQSUyMCUyMCUyMCUyMGZyZXF1ZW5jeV9wZW5hbHR5JTNEMC41JTJDJTIwJTIwJTIzJTIwQWRkaXRpb25hbCUyMGZyZXF1ZW5jeSUyMHBlbmFsdHklMEElMjAlMjAlMjAlMjBwcmVzZW5jZV9wZW5hbHR5JTNEMC41JTJDJTIwJTIwJTIzJTIwQWRkaXRpb25hbCUyMHByZXNlbmNlJTIwcGVuYWx0eSUwQSk=",highlighted:`<span class="hljs-comment"># Via OpenAI API</span>
response = client.completions.create(
model=<span class="hljs-string">&quot;smollm2-1.7b-instruct&quot;</span>,
prompt=<span class="hljs-string">&quot;Write a varied text&quot;</span>,
frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Penalize frequent tokens</span>
presence_penalty=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Penalize tokens already present</span>
)
<span class="hljs-comment"># Via direct library</span>
output = llm(
<span class="hljs-string">&quot;Write a varied text&quot;</span>,
repeat_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Penalize repeated tokens</span>
frequency_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># Additional frequency penalty</span>
presence_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># Additional presence penalty</span>
)`,wrap:!1}}),c=new W({props:{code:"cGFyYW1zJTIwJTNEJTIwU2FtcGxpbmdQYXJhbXMoJTBBJTIwJTIwJTIwJTIwcHJlc2VuY2VfcGVuYWx0eSUzRDAuMSUyQyUyMCUyMCUyMyUyMFBlbmFsaXplJTIwdG9rZW4lMjBwcmVzZW5jZSUwQSUyMCUyMCUyMCUyMGZyZXF1ZW5jeV9wZW5hbHR5JTNEMC4xJTJDJTIwJTIwJTIzJTIwUGVuYWxpemUlMjB0b2tlbiUyMGZyZXF1ZW5jeSUwQSk=",highlighted:`params = SamplingParams(
presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Penalize token presence</span>
frequency_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Penalize token frequency</span>
)`,wrap:!1}}),{c(){y=X(`<hfoption value="tgi" label="TGI">
`),g(r.$$.fragment),U=X(`
</hfoption>
<hfoption value="llama.cpp" label="llama.cpp">
`),g(J.$$.fragment),T=X(`
</hfoption>
<hfoption value="vllm" label="vLLM">
`),g(c.$$.fragment),m=X(`
</hfoption>`)},l(s){y=k(s,`<hfoption value="tgi" label="TGI">
`),B(r.$$.fragment,s),U=k(s,`
</hfoption>
<hfoption value="llama.cpp" label="llama.cpp">
`),B(J.$$.fragment,s),T=k(s,`
</hfoption>
<hfoption value="vllm" label="vLLM">
`),B(c.$$.fragment,s),m=k(s,`
</hfoption>`)},m(s,M){t(s,y,M),G(r,s,M),t(s,U,M),G(J,s,M),t(s,T,M),G(c,s,M),t(s,m,M),I=!0},p:Gl,i(s){I||(d(r.$$.fragment,s),d(J.$$.fragment,s),d(c.$$.fragment,s),I=!0)},o(s){C(r.$$.fragment,s),C(J.$$.fragment,s),C(c.$$.fragment,s),I=!1},d(s){s&&(e(y),e(U),e(T),e(m)),Z(r,s),Z(J,s),Z(c,s)}}}
/**
 * Svelte-compiled slot fragment for the Length Control and Stop Sequences option
 * group (jt passes it as the default slot of the Dl instance Ql).
 *
 * Builds three W code-block components (r, J, c) with the max-token and
 * stop-sequence examples and interleaves them with literal hfoption marker text
 * for the tgi, llama.cpp and vllm options.
 *
 * The parameter A (component context) is not read inside this fragment.
 * Returns the fragment hooks c, l, m, p, i, o and d; p is the imported Gl because
 * nothing in this fragment changes after creation, and the local flag I keeps i
 * from re-running until o has reset it.
 */
function Tt(A){let y,r,U,J,T,c,m,I;return r=new W({props:{code:"Y2xpZW50LmdlbmVyYXRlKCUwQSUyMCUyMCUyMCUyMCUyMkdlbmVyYXRlJTIwYSUyMHNob3J0JTIwcGFyYWdyYXBoJTIyJTJDJTBBJTIwJTIwJTIwJTIwbWF4X25ld190b2tlbnMlM0QxMDAlMkMlMEElMjAlMjAlMjAlMjBtaW5fbmV3X3Rva2VucyUzRDEwJTJDJTBBJTIwJTIwJTIwJTIwc3RvcF9zZXF1ZW5jZXMlM0QlNUIlMjIlNUNuJTVDbiUyMiUyQyUyMCUyMiUyMyUyMyUyMyUyMiU1RCUyQyUwQSk=",highlighted:`client.generate(
<span class="hljs-string">&quot;Generate a short paragraph&quot;</span>,
max_new_tokens=<span class="hljs-number">100</span>,
min_new_tokens=<span class="hljs-number">10</span>,
stop_sequences=[<span class="hljs-string">&quot;\\n\\n&quot;</span>, <span class="hljs-string">&quot;###&quot;</span>],
)`,wrap:!1}}),J=new W({props:{code:"JTIzJTIwVmlhJTIwT3BlbkFJJTIwQVBJJTBBcmVzcG9uc2UlMjAlM0QlMjBjbGllbnQuY29tcGxldGlvbnMuY3JlYXRlKCUwQSUyMCUyMCUyMCUyMG1vZGVsJTNEJTIyc21vbGxtMi0xLjdiLWluc3RydWN0JTIyJTJDJTBBJTIwJTIwJTIwJTIwcHJvbXB0JTNEJTIyR2VuZXJhdGUlMjBhJTIwc2hvcnQlMjBwYXJhZ3JhcGglMjIlMkMlMEElMjAlMjAlMjAlMjBtYXhfdG9rZW5zJTNEMTAwJTJDJTBBJTIwJTIwJTIwJTIwc3RvcCUzRCU1QiUyMiU1Q24lNUNuJTIyJTJDJTIwJTIyJTIzJTIzJTIzJTIyJTVEJTJDJTBBKSUwQSUwQSUyMyUyMFZpYSUyMGRpcmVjdCUyMGxpYnJhcnklMEFvdXRwdXQlMjAlM0QlMjBsbG0oJTIyR2VuZXJhdGUlMjBhJTIwc2hvcnQlMjBwYXJhZ3JhcGglMjIlMkMlMjBtYXhfdG9rZW5zJTNEMTAwJTJDJTIwc3RvcCUzRCU1QiUyMiU1Q24lNUNuJTIyJTJDJTIwJTIyJTIzJTIzJTIzJTIyJTVEKQ==",highlighted:`<span class="hljs-comment"># Via OpenAI API</span>
response = client.completions.create(
model=<span class="hljs-string">&quot;smollm2-1.7b-instruct&quot;</span>,
prompt=<span class="hljs-string">&quot;Generate a short paragraph&quot;</span>,
max_tokens=<span class="hljs-number">100</span>,
stop=[<span class="hljs-string">&quot;\\n\\n&quot;</span>, <span class="hljs-string">&quot;###&quot;</span>],
)
<span class="hljs-comment"># Via direct library</span>
output = llm(<span class="hljs-string">&quot;Generate a short paragraph&quot;</span>, max_tokens=<span class="hljs-number">100</span>, stop=[<span class="hljs-string">&quot;\\n\\n&quot;</span>, <span class="hljs-string">&quot;###&quot;</span>])`,wrap:!1}}),c=new W({props:{code:"cGFyYW1zJTIwJTNEJTIwU2FtcGxpbmdQYXJhbXMoJTBBJTIwJTIwJTIwJTIwbWF4X3Rva2VucyUzRDEwMCUyQyUwQSUyMCUyMCUyMCUyMG1pbl90b2tlbnMlM0QxMCUyQyUwQSUyMCUyMCUyMCUyMHN0b3AlM0QlNUIlMjIlMjMlMjMlMjMlMjIlMkMlMjAlMjIlNUNuJTVDbiUyMiU1RCUyQyUwQSUyMCUyMCUyMCUyMGlnbm9yZV9lb3MlM0RGYWxzZSUyQyUwQSUyMCUyMCUyMCUyMHNraXBfc3BlY2lhbF90b2tlbnMlM0RUcnVlJTJDJTBBKQ==",highlighted:`params = SamplingParams(
max_tokens=<span class="hljs-number">100</span>,
min_tokens=<span class="hljs-number">10</span>,
stop=[<span class="hljs-string">&quot;###&quot;</span>, <span class="hljs-string">&quot;\\n\\n&quot;</span>],
ignore_eos=<span class="hljs-literal">False</span>,
skip_special_tokens=<span class="hljs-literal">True</span>,
)`,wrap:!1}}),{c(){y=X(`<hfoption value="tgi" label="TGI">
`),g(r.$$.fragment),U=X(`
</hfoption>
<hfoption value="llama.cpp" label="llama.cpp">
`),g(J.$$.fragment),T=X(`
</hfoption>
<hfoption value="vllm" label="vLLM">
`),g(c.$$.fragment),m=X(`
</hfoption>`)},l(s){y=k(s,`<hfoption value="tgi" label="TGI">
`),B(r.$$.fragment,s),U=k(s,`
</hfoption>
<hfoption value="llama.cpp" label="llama.cpp">
`),B(J.$$.fragment,s),T=k(s,`
</hfoption>
<hfoption value="vllm" label="vLLM">
`),B(c.$$.fragment,s),m=k(s,`
</hfoption>`)},m(s,M){t(s,y,M),G(r,s,M),t(s,U,M),G(J,s,M),t(s,T,M),G(c,s,M),t(s,m,M),I=!0},p:Gl,i(s){I||(d(r.$$.fragment,s),d(J.$$.fragment,s),d(c.$$.fragment,s),I=!0)},o(s){C(r.$$.fragment,s),C(J.$$.fragment,s),C(c.$$.fragment,s),I=!1},d(s){s&&(e(y),e(U),e(T),e(m)),Z(r,s),Z(J,s),Z(c,s)}}}
/**
 * Svelte-compiled slot fragment for the Memory Management option group
 * (jt passes it as the default slot of the Dl instance Vl).
 *
 * Unlike the sibling slot fragments it mixes prose and code: four p elements
 * (r, m, j, N) take their text from the local strings U, I, V and x, and four W
 * code-block components (T, M, S, E) carry the TGI docker command, the two
 * llama.cpp server commands and the vLLM engine-args snippet. Literal hfoption
 * marker text for tgi, llama.cpp and vllm separates the groups.
 *
 * In l, each paragraph is claimed with a data-svelte-h attribute and its text is
 * only reassigned when f(node) differs from the expected svelte-* hash.
 * The parameter A (component context) is not read inside this fragment; p is the
 * imported Gl, and the local flag R keeps i from re-running until o resets it.
 */
function ut(A){let y,r,U="TGI uses Flash Attention 2 and continuous batching:",J,T,c,m,I="llama.cpp uses quantization and optimized memory layout:",s,M,Q,j,V="For models too large for your GPU, you can use CPU offloading:",H,S,cl,N,x="vLLM uses PagedAttention for optimal memory management:",D,E,rl,R;return T=new W({props:{code:"JTIzJTIwRG9ja2VyJTIwZGVwbG95bWVudCUyMHdpdGglMjBtZW1vcnklMjBvcHRpbWl6YXRpb24lMEFkb2NrZXIlMjBydW4lMjAtLWdwdXMlMjBhbGwlMjAtcCUyMDgwODAlM0E4MCUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tc2htLXNpemUlMjAxZyUyMCU1QyUwQSUyMCUyMCUyMCUyMGdoY3IuaW8lMkZodWdnaW5nZmFjZSUyRnRleHQtZ2VuZXJhdGlvbi1pbmZlcmVuY2UlM0FsYXRlc3QlMjAlNUMlMEElMjAlMjAlMjAlMjAtLW1vZGVsLWlkJTIwSHVnZ2luZ0ZhY2VUQiUyRlNtb2xMTTItMS43Qi1JbnN0cnVjdCUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tbWF4LWJhdGNoLXRvdGFsLXRva2VucyUyMDgxOTIlMjAlNUMlMEElMjAlMjAlMjAlMjAtLW1heC1pbnB1dC1sZW5ndGglMjA0MDk2",highlighted:`<span class="hljs-comment"># Docker deployment with memory optimization</span>
docker run --gpus all -p 8080:80 \\
--shm-size 1g \\
ghcr.io/huggingface/text-generation-inference:latest \\
--model-id HuggingFaceTB/SmolLM2-1.7B-Instruct \\
--max-batch-total-tokens 8192 \\
--max-input-length 4096`,wrap:!1}}),M=new W({props:{code:"JTIzJTIwU2VydmVyJTIwd2l0aCUyMG1lbW9yeSUyMG9wdGltaXphdGlvbnMlMEEuJTJGc2VydmVyJTIwJTVDJTBBJTIwJTIwJTIwJTIwLW0lMjBzbW9sbG0yLTEuN2ItaW5zdHJ1Y3QuUTRfS19NLmdndWYlMjAlNUMlMEElMjAlMjAlMjAlMjAtLWhvc3QlMjAwLjAuMC4wJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1wb3J0JTIwODA4MCUyMCU1QyUwQSUyMCUyMCUyMCUyMC1jJTIwMjA0OCUyMCU1QyUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMENvbnRleHQlMjBzaXplJTBBJTIwJTIwJTIwJTIwLS10aHJlYWRzJTIwNCUyMCU1QyUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMENQVSUyMHRocmVhZHMlMEElMjAlMjAlMjAlMjAtLW4tZ3B1LWxheWVycyUyMDMyJTIwJTVDJTIwJTIwJTIwJTIwJTIwJTIzJTIwVXNlJTIwbW9yZSUyMEdQVSUyMGxheWVycyUyMGZvciUyMGxhcmdlciUyMG1vZGVscyUwQSUyMCUyMCUyMCUyMC0tbWxvY2slMjAlNUMlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjMlMjBMb2NrJTIwbWVtb3J5JTIwdG8lMjBwcmV2ZW50JTIwc3dhcHBpbmclMEElMjAlMjAlMjAlMjAtLWNvbnQtYmF0Y2hpbmclMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjMlMjBFbmFibGUlMjBjb250aW51b3VzJTIwYmF0Y2hpbmc=",highlighted:`<span class="hljs-comment"># Server with memory optimizations</span>
./server \\
-m smollm2-1.7b-instruct.Q4_K_M.gguf \\
--host 0.0.0.0 \\
--port 8080 \\
-c 2048 \\ <span class="hljs-comment"># Context size</span>
--threads 4 \\ <span class="hljs-comment"># CPU threads</span>
--n-gpu-layers 32 \\ <span class="hljs-comment"># Use more GPU layers for larger models</span>
--mlock \\ <span class="hljs-comment"># Lock memory to prevent swapping</span>
--cont-batching <span class="hljs-comment"># Enable continuous batching</span>`,wrap:!1}}),S=new W({props:{code:"LiUyRnNlcnZlciUyMCU1QyUwQSUyMCUyMCUyMCUyMC1tJTIwc21vbGxtMi0xLjdiLWluc3RydWN0LlE0X0tfTS5nZ3VmJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1uLWdwdS1sYXllcnMlMjAyMCUyMCU1QyUyMCUyMCUyMCUyMCUyMCUyMyUyMEtlZXAlMjBmaXJzdCUyMDIwJTIwbGF5ZXJzJTIwb24lMjBHUFUlMEElMjAlMjAlMjAlMjAtLXRocmVhZHMlMjA4JTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIzJTIwVXNlJTIwbW9yZSUyMENQVSUyMHRocmVhZHMlMjBmb3IlMjBDUFUlMjBsYXllcnM=",highlighted:`./server \\
-m smollm2-1.7b-instruct.Q4_K_M.gguf \\
--n-gpu-layers 20 \\ <span class="hljs-comment"># Keep first 20 layers on GPU</span>
--threads 8 <span class="hljs-comment"># Use more CPU threads for CPU layers</span>`,wrap:!1}}),E=new W({props:{code:"ZnJvbSUyMHZsbG0uZW5naW5lLmFyZ191dGlscyUyMGltcG9ydCUyMEFzeW5jRW5naW5lQXJncyUwQSUwQWVuZ2luZV9hcmdzJTIwJTNEJTIwQXN5bmNFbmdpbmVBcmdzKCUwQSUyMCUyMCUyMCUyMG1vZGVsJTNEJTIySHVnZ2luZ0ZhY2VUQiUyRlNtb2xMTTItMS43Qi1JbnN0cnVjdCUyMiUyQyUwQSUyMCUyMCUyMCUyMGdwdV9tZW1vcnlfdXRpbGl6YXRpb24lM0QwLjg1JTJDJTBBJTIwJTIwJTIwJTIwbWF4X251bV9iYXRjaGVkX3Rva2VucyUzRDgxOTIlMkMlMEElMjAlMjAlMjAlMjBibG9ja19zaXplJTNEMTYlMkMlMEEpJTBBJTBBbGxtJTIwJTNEJTIwTExNKGVuZ2luZV9hcmdzJTNEZW5naW5lX2FyZ3Mp",highlighted:`<span class="hljs-keyword">from</span> vllm.engine.arg_utils <span class="hljs-keyword">import</span> AsyncEngineArgs
engine_args = AsyncEngineArgs(
model=<span class="hljs-string">&quot;HuggingFaceTB/SmolLM2-1.7B-Instruct&quot;</span>,
gpu_memory_utilization=<span class="hljs-number">0.85</span>,
max_num_batched_tokens=<span class="hljs-number">8192</span>,
block_size=<span class="hljs-number">16</span>,
)
llm = LLM(engine_args=engine_args)`,wrap:!1}}),{c(){y=X(`<hfoption value="tgi" label="TGI">
`),r=w("p"),r.textContent=U,J=o(),g(T.$$.fragment),c=X(`
</hfoption>
<hfoption value="llama.cpp" label="llama.cpp">
`),m=w("p"),m.textContent=I,s=o(),g(M.$$.fragment),Q=o(),j=w("p"),j.textContent=V,H=o(),g(S.$$.fragment),cl=X(`
</hfoption>
<hfoption value="vllm" label="vLLM">
`),N=w("p"),N.textContent=x,D=o(),g(E.$$.fragment),rl=X(`
</hfoption>`)},l(u){y=k(u,`<hfoption value="tgi" label="TGI">
`),r=h(u,"P",{"data-svelte-h":!0}),f(r)!=="svelte-1uwkewc"&&(r.textContent=U),J=i(u),B(T.$$.fragment,u),c=k(u,`
</hfoption>
<hfoption value="llama.cpp" label="llama.cpp">
`),m=h(u,"P",{"data-svelte-h":!0}),f(m)!=="svelte-f66vqu"&&(m.textContent=I),s=i(u),B(M.$$.fragment,u),Q=i(u),j=h(u,"P",{"data-svelte-h":!0}),f(j)!=="svelte-1m87csn"&&(j.textContent=V),H=i(u),B(S.$$.fragment,u),cl=k(u,`
</hfoption>
<hfoption value="vllm" label="vLLM">
`),N=h(u,"P",{"data-svelte-h":!0}),f(N)!=="svelte-6lsf9z"&&(N.textContent=x),D=i(u),B(E.$$.fragment,u),rl=k(u,`
</hfoption>`)},m(u,$){t(u,y,$),t(u,r,$),t(u,J,$),G(T,u,$),t(u,c,$),t(u,m,$),t(u,s,$),G(M,u,$),t(u,Q,$),t(u,j,$),t(u,H,$),G(S,u,$),t(u,cl,$),t(u,N,$),t(u,D,$),G(E,u,$),t(u,rl,$),R=!0},p:Gl,i(u){R||(d(T.$$.fragment,u),d(M.$$.fragment,u),d(S.$$.fragment,u),d(E.$$.fragment,u),R=!0)},o(u){C(T.$$.fragment,u),C(M.$$.fragment,u),C(S.$$.fragment,u),C(E.$$.fragment,u),R=!1},d(u){u&&(e(y),e(r),e(J),e(c),e(m),e(s),e(Q),e(j),e(H),e(cl),e(N),e(D),e(rl)),Z(T,u),Z(M,u),Z(S,u),Z(E,u)}}}
/**
 * Top-level Svelte-compiled fragment for the Optimized Inference Deployment page.
 *
 * Keeps the page copy in local strings, then instantiates the section headings
 * (ul), three tip boxes (be) whose default slots are it, yt and ct, six
 * inference-frameworks option groups (Dl) whose default slots are rt, Jt, mt, Ut,
 * Tt and ut, and a trailing nt component pointing at the chapter2/8.mdx source.
 * It also adds a meta element named hf:doc:metadata (content taken from wt) to
 * document.head and an img element for the Flash Attention figure.
 *
 * The parameter A (component context) is only forwarded as ctx inside the
 * $$scope of the slotted components. The update hook p rebuilds those $$scope
 * objects when mask 2 of the dirty flags is set; the local flag we keeps i from
 * re-running until o resets it.
 */
function jt(A){let y,r,U,J,T,c,m,I="In this section, we’ll explore advanced frameworks for optimizing LLM deployments: Text Generation Inference (TGI), vLLM, and llama.cpp. These applications are primarily used in production environments to serve LLMs to users. This section focuses on how to deploy these frameworks in production rather than how to use them for inference on a single machine.",s,M,Q="We’ll cover how these tools maximize inference efficiency and simplify production deployments of Large Language Models.",j,V,H,S,cl="TGI, vLLM, and llama.cpp serve similar purposes but have distinct characteristics that make them better suited for different use cases. Let’s look at the key differences between them, focusing on performance and integration.",N,x,D,E,rl="<strong>TGI</strong> is designed to be stable and predictable in production, using fixed sequence lengths to keep memory usage consistent. TGI manages memory using Flash Attention 2 and continuous batching techniques. This means it can process attention calculations very efficiently and keep the GPU busy by constantly feeding it work. The system can move parts of the model between CPU and GPU when needed, which helps handle larger models.",R,u,$,el,q,tl,P,bl="<strong>vLLM</strong> takes a different approach by using PagedAttention. Just like how a computer manages its memory in pages, vLLM splits the model’s memory into smaller blocks. 
This clever system means it can handle different-sized requests more flexibly and doesn’t waste memory space. It’s particularly good at sharing memory between different requests and reduces memory fragmentation, which makes the whole system more efficient.",sl,Y,nl,K,dl="<strong>llama.cpp</strong> is a highly optimized C/C++ implementation originally designed for running LLaMA models on consumer hardware. It focuses on CPU efficiency with optional GPU acceleration and is ideal for resource-constrained environments. llama.cpp uses quantization techniques to reduce model size and memory requirements while maintaining good performance. It implements optimized kernels for various CPU architectures and supports basic KV cache management for efficient token generation.",al,z,Ml,O,jl,_,Jl="Let’s move on to the deployment and integration differences between the frameworks.",pl,ll,Cl="<strong>TGI</strong> excels in enterprise-level deployment with its production-ready features. It comes with built-in Kubernetes support and includes everything you need for running in production, like monitoring through Prometheus and Grafana, automatic scaling, and comprehensive safety features. The system also includes enterprise-grade logging and various protective measures like content filtering and rate limiting to keep your deployment secure and stable.",ol,F,Tl="<strong>vLLM</strong> takes a more flexible, developer-friendly approach to deployment. It’s built with Python at its core and can easily replace OpenAI’s API in your existing applications. The framework focuses on delivering raw performance and can be customized to fit your specific needs. It works particularly well with Ray for managing clusters, making it a great choice when you need high performance and adaptability.",wl,L,ml="<strong>llama.cpp</strong> prioritizes simplicity and portability. 
Its server implementation is lightweight and can run on a wide range of hardware, from powerful servers to consumer laptops and even some high-end mobile devices. With minimal dependencies and a simple C/C++ core, it’s easy to deploy in environments where installing Python frameworks would be challenging. The server provides an OpenAI-compatible API while maintaining a much smaller resource footprint than other solutions.",il,Ul,a,b,Sl="Let’s explore how to use these frameworks for deploying LLMs, starting with installation and basic setup.",hl,fl,gl,yl,Bl,Il,n,v,Ce="Let’s look at examples of text generation with the frameworks:",Ol,Zl,le,El,ee,$l,te,Nl,fe="The process of generating text involves selecting the next token at each step. This selection process can be controlled through various parameters:",se,Xl,ge="<li><strong>Raw Logits</strong>: The initial output probabilities for each token</li> <li><strong>Temperature</strong>: Controls randomness in selection (higher = more creative)</li> <li><strong>Top-p (Nucleus) Sampling</strong>: Filters to top tokens making up X% of probability mass</li> <li><strong>Top-k Filtering</strong>: Limits selection to k most likely tokens</li>",ne,kl,Be="Here’s how to configure these parameters:",ae,vl,Me,Rl,pe,xl,Ge="Both frameworks provide ways to prevent repetitive text generation:",oe,Al,ie,ql,ye,Yl,Ze="You can control generation length and specify when to stop:",ce,Ql,re,zl,Je,_l,ve="Both frameworks implement advanced memory management techniques for efficient inference.",me,Vl,Ue,Fl,Te,Ll,Ae='<li><a href="https://huggingface.co/docs/text-generation-inference" rel="nofollow">Text Generation Inference Documentation</a></li> <li><a href="https://github.com/huggingface/text-generation-inference" rel="nofollow">TGI GitHub Repository</a></li> <li><a href="https://vllm.readthedocs.io/" rel="nofollow">vLLM Documentation</a></li> <li><a href="https://github.com/vllm-project/vllm" rel="nofollow">vLLM GitHub Repository</a></li> 
<li><a href="https://arxiv.org/abs/2309.06180" rel="nofollow">PagedAttention Paper</a></li> <li><a href="https://github.com/ggerganov/llama.cpp" rel="nofollow">llama.cpp GitHub Repository</a></li> <li><a href="https://github.com/abetlen/llama-cpp-python" rel="nofollow">llama-cpp-python Repository</a></li>',ue,Hl,je,Pl,we;return T=new ul({props:{title:"Optimized Inference Deployment",local:"optimized-inference-deployment",headingTag:"h1"}}),V=new ul({props:{title:"Framework Selection Guide",local:"framework-selection-guide",headingTag:"h2"}}),x=new ul({props:{title:"Memory Management and Performance",local:"memory-management-and-performance",headingTag:"h3"}}),q=new be({props:{title:"How Flash Attention Works",$$slots:{default:[it]},$$scope:{ctx:A}}}),Y=new be({props:{title:"How PagedAttention Works",$$slots:{default:[yt]},$$scope:{ctx:A}}}),z=new be({props:{title:"How llama.cpp Quantization Works",$$slots:{default:[ct]},$$scope:{ctx:A}}}),O=new ul({props:{title:"Deployment and Integration",local:"deployment-and-integration",headingTag:"h3"}}),Ul=new ul({props:{title:"Getting Started",local:"getting-started",headingTag:"h2"}}),fl=new ul({props:{title:"Installation and Basic Setup",local:"installation-and-basic-setup",headingTag:"h3"}}),yl=new Dl({props:{id:"inference-frameworks",options:[],$$slots:{default:[rt]},$$scope:{ctx:A}}}),Il=new ul({props:{title:"Basic Text Generation",local:"basic-text-generation",headingTag:"h3"}}),Zl=new Dl({props:{id:"inference-frameworks",options:[],$$slots:{default:[Jt]},$$scope:{ctx:A}}}),El=new ul({props:{title:"Advanced Generation Control",local:"advanced-generation-control",headingTag:"h2"}}),$l=new ul({props:{title:"Token Selection and Sampling",local:"token-selection-and-sampling",headingTag:"h3"}}),vl=new Dl({props:{id:"inference-frameworks",options:[],$$slots:{default:[mt]},$$scope:{ctx:A}}}),Rl=new ul({props:{title:"Controlling Repetition",local:"controlling-repetition",headingTag:"h3"}}),Al=new 
Dl({props:{id:"inference-frameworks",options:[],$$slots:{default:[Ut]},$$scope:{ctx:A}}}),ql=new ul({props:{title:"Length Control and Stop Sequences",local:"length-control-and-stop-sequences",headingTag:"h3"}}),Ql=new Dl({props:{id:"inference-frameworks",options:[],$$slots:{default:[Tt]},$$scope:{ctx:A}}}),zl=new ul({props:{title:"Memory Management",local:"memory-management",headingTag:"h2"}}),Vl=new Dl({props:{id:"inference-frameworks",options:[],$$slots:{default:[ut]},$$scope:{ctx:A}}}),Fl=new ul({props:{title:"Resources",local:"resources",headingTag:"h2"}}),Hl=new nt({props:{source:"https://github.com/huggingface/course/blob/main/chapters/en/chapter2/8.mdx"}}),{c(){y=w("meta"),r=o(),U=w("p"),J=o(),g(T.$$.fragment),c=o(),m=w("p"),m.textContent=I,s=o(),M=w("p"),M.textContent=Q,j=o(),g(V.$$.fragment),H=o(),S=w("p"),S.textContent=cl,N=o(),g(x.$$.fragment),D=o(),E=w("p"),E.innerHTML=rl,R=o(),u=w("img"),el=o(),g(q.$$.fragment),tl=o(),P=w("p"),P.innerHTML=bl,sl=o(),g(Y.$$.fragment),nl=o(),K=w("p"),K.innerHTML=dl,al=o(),g(z.$$.fragment),Ml=o(),g(O.$$.fragment),jl=o(),_=w("p"),_.textContent=Jl,pl=o(),ll=w("p"),ll.innerHTML=Cl,ol=o(),F=w("p"),F.innerHTML=Tl,wl=o(),L=w("p"),L.innerHTML=ml,il=o(),g(Ul.$$.fragment),a=o(),b=w("p"),b.textContent=Sl,hl=o(),g(fl.$$.fragment),gl=o(),g(yl.$$.fragment),Bl=o(),g(Il.$$.fragment),n=o(),v=w("p"),v.textContent=Ce,Ol=o(),g(Zl.$$.fragment),le=o(),g(El.$$.fragment),ee=o(),g($l.$$.fragment),te=o(),Nl=w("p"),Nl.textContent=fe,se=o(),Xl=w("ol"),Xl.innerHTML=ge,ne=o(),kl=w("p"),kl.textContent=Be,ae=o(),g(vl.$$.fragment),Me=o(),g(Rl.$$.fragment),pe=o(),xl=w("p"),xl.textContent=Ge,oe=o(),g(Al.$$.fragment),ie=o(),g(ql.$$.fragment),ye=o(),Yl=w("p"),Yl.textContent=Ze,ce=o(),g(Ql.$$.fragment),re=o(),g(zl.$$.fragment),Je=o(),_l=w("p"),_l.textContent=ve,me=o(),g(Vl.$$.fragment),Ue=o(),g(Fl.$$.fragment),Te=o(),Ll=w("ul"),Ll.innerHTML=Ae,ue=o(),g(Hl.$$.fragment),je=o(),Pl=w("p"),this.h()},l(l){const 
p=st("svelte-u9bgzb",document.head);y=h(p,"META",{name:!0,content:!0}),p.forEach(e),r=i(l),U=h(l,"P",{}),Kl(U).forEach(e),J=i(l),B(T.$$.fragment,l),c=i(l),m=h(l,"P",{"data-svelte-h":!0}),f(m)!=="svelte-1c3ix4n"&&(m.textContent=I),s=i(l),M=h(l,"P",{"data-svelte-h":!0}),f(M)!=="svelte-1xh6xvz"&&(M.textContent=Q),j=i(l),B(V.$$.fragment,l),H=i(l),S=h(l,"P",{"data-svelte-h":!0}),f(S)!=="svelte-xxm0i8"&&(S.textContent=cl),N=i(l),B(x.$$.fragment,l),D=i(l),E=h(l,"P",{"data-svelte-h":!0}),f(E)!=="svelte-ms8d3b"&&(E.innerHTML=rl),R=i(l),u=h(l,"IMG",{src:!0,alt:!0}),el=i(l),B(q.$$.fragment,l),tl=i(l),P=h(l,"P",{"data-svelte-h":!0}),f(P)!=="svelte-11etipa"&&(P.innerHTML=bl),sl=i(l),B(Y.$$.fragment,l),nl=i(l),K=h(l,"P",{"data-svelte-h":!0}),f(K)!=="svelte-19a6l8z"&&(K.innerHTML=dl),al=i(l),B(z.$$.fragment,l),Ml=i(l),B(O.$$.fragment,l),jl=i(l),_=h(l,"P",{"data-svelte-h":!0}),f(_)!=="svelte-o1p1av"&&(_.textContent=Jl),pl=i(l),ll=h(l,"P",{"data-svelte-h":!0}),f(ll)!=="svelte-l34lqu"&&(ll.innerHTML=Cl),ol=i(l),F=h(l,"P",{"data-svelte-h":!0}),f(F)!=="svelte-1isy26s"&&(F.innerHTML=Tl),wl=i(l),L=h(l,"P",{"data-svelte-h":!0}),f(L)!=="svelte-1gsq6x1"&&(L.innerHTML=ml),il=i(l),B(Ul.$$.fragment,l),a=i(l),b=h(l,"P",{"data-svelte-h":!0}),f(b)!=="svelte-1k0ewbc"&&(b.textContent=Sl),hl=i(l),B(fl.$$.fragment,l),gl=i(l),B(yl.$$.fragment,l),Bl=i(l),B(Il.$$.fragment,l),n=i(l),v=h(l,"P",{"data-svelte-h":!0}),f(v)!=="svelte-yy57k8"&&(v.textContent=Ce),Ol=i(l),B(Zl.$$.fragment,l),le=i(l),B(El.$$.fragment,l),ee=i(l),B($l.$$.fragment,l),te=i(l),Nl=h(l,"P",{"data-svelte-h":!0}),f(Nl)!=="svelte-1jdaj55"&&(Nl.textContent=fe),se=i(l),Xl=h(l,"OL",{"data-svelte-h":!0}),f(Xl)!=="svelte-1j60hyx"&&(Xl.innerHTML=ge),ne=i(l),kl=h(l,"P",{"data-svelte-h":!0}),f(kl)!=="svelte-nakymk"&&(kl.textContent=Be),ae=i(l),B(vl.$$.fragment,l),Me=i(l),B(Rl.$$.fragment,l),pe=i(l),xl=h(l,"P",{"data-svelte-h":!0}),f(xl)!=="svelte-euetng"&&(xl.textContent=Ge),oe=i(l),B(Al.$$.fragment,l),ie=i(l),B(ql.$$.fragment,l),ye=i(l),Yl=h(l,"P
",{"data-svelte-h":!0}),f(Yl)!=="svelte-1ut8dfv"&&(Yl.textContent=Ze),ce=i(l),B(Ql.$$.fragment,l),re=i(l),B(zl.$$.fragment,l),Je=i(l),_l=h(l,"P",{"data-svelte-h":!0}),f(_l)!=="svelte-b9x09v"&&(_l.textContent=ve),me=i(l),B(Vl.$$.fragment,l),Ue=i(l),B(Fl.$$.fragment,l),Te=i(l),Ll=h(l,"UL",{"data-svelte-h":!0}),f(Ll)!=="svelte-15h1dzu"&&(Ll.innerHTML=Ae),ue=i(l),B(Hl.$$.fragment,l),je=i(l),Pl=h(l,"P",{}),Kl(Pl).forEach(e),this.h()},h(){Wl(y,"name","hf:doc:metadata"),Wl(y,"content",wt),Oe(u.src,$="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/flash-attn.png")||Wl(u,"src",$),Wl(u,"alt","Flash Attention")},m(l,p){de(document.head,y),t(l,r,p),t(l,U,p),t(l,J,p),G(T,l,p),t(l,c,p),t(l,m,p),t(l,s,p),t(l,M,p),t(l,j,p),G(V,l,p),t(l,H,p),t(l,S,p),t(l,N,p),G(x,l,p),t(l,D,p),t(l,E,p),t(l,R,p),t(l,u,p),t(l,el,p),G(q,l,p),t(l,tl,p),t(l,P,p),t(l,sl,p),G(Y,l,p),t(l,nl,p),t(l,K,p),t(l,al,p),G(z,l,p),t(l,Ml,p),G(O,l,p),t(l,jl,p),t(l,_,p),t(l,pl,p),t(l,ll,p),t(l,ol,p),t(l,F,p),t(l,wl,p),t(l,L,p),t(l,il,p),G(Ul,l,p),t(l,a,p),t(l,b,p),t(l,hl,p),G(fl,l,p),t(l,gl,p),G(yl,l,p),t(l,Bl,p),G(Il,l,p),t(l,n,p),t(l,v,p),t(l,Ol,p),G(Zl,l,p),t(l,le,p),G(El,l,p),t(l,ee,p),G($l,l,p),t(l,te,p),t(l,Nl,p),t(l,se,p),t(l,Xl,p),t(l,ne,p),t(l,kl,p),t(l,ae,p),G(vl,l,p),t(l,Me,p),G(Rl,l,p),t(l,pe,p),t(l,xl,p),t(l,oe,p),G(Al,l,p),t(l,ie,p),G(ql,l,p),t(l,ye,p),t(l,Yl,p),t(l,ce,p),G(Ql,l,p),t(l,re,p),G(zl,l,p),t(l,Je,p),t(l,_l,p),t(l,me,p),G(Vl,l,p),t(l,Ue,p),G(Fl,l,p),t(l,Te,p),t(l,Ll,p),t(l,ue,p),G(Hl,l,p),t(l,je,p),t(l,Pl,p),we=!0},p(l,[p]){const Qe={};p&2&&(Qe.$$scope={dirty:p,ctx:l}),q.$set(Qe);const Ve={};p&2&&(Ve.$$scope={dirty:p,ctx:l}),Y.$set(Ve);const We={};p&2&&(We.$$scope={dirty:p,ctx:l}),z.$set(We);const Se={};p&2&&(Se.$$scope={dirty:p,ctx:l}),yl.$set(Se);const Ee={};p&2&&(Ee.$$scope={dirty:p,ctx:l}),Zl.$set(Ee);const $e={};p&2&&($e.$$scope={dirty:p,ctx:l}),vl.$set($e);const Ne={};p&2&&(Ne.$$scope={dirty:p,ctx:l}),Al.$set(Ne);const 
Xe={};p&2&&(Xe.$$scope={dirty:p,ctx:l}),Ql.$set(Xe);const ke={};p&2&&(ke.$$scope={dirty:p,ctx:l}),Vl.$set(ke)},i(l){we||(d(T.$$.fragment,l),d(V.$$.fragment,l),d(x.$$.fragment,l),d(q.$$.fragment,l),d(Y.$$.fragment,l),d(z.$$.fragment,l),d(O.$$.fragment,l),d(Ul.$$.fragment,l),d(fl.$$.fragment,l),d(yl.$$.fragment,l),d(Il.$$.fragment,l),d(Zl.$$.fragment,l),d(El.$$.fragment,l),d($l.$$.fragment,l),d(vl.$$.fragment,l),d(Rl.$$.fragment,l),d(Al.$$.fragment,l),d(ql.$$.fragment,l),d(Ql.$$.fragment,l),d(zl.$$.fragment,l),d(Vl.$$.fragment,l),d(Fl.$$.fragment,l),d(Hl.$$.fragment,l),we=!0)},o(l){C(T.$$.fragment,l),C(V.$$.fragment,l),C(x.$$.fragment,l),C(q.$$.fragment,l),C(Y.$$.fragment,l),C(z.$$.fragment,l),C(O.$$.fragment,l),C(Ul.$$.fragment,l),C(fl.$$.fragment,l),C(yl.$$.fragment,l),C(Il.$$.fragment,l),C(Zl.$$.fragment,l),C(El.$$.fragment,l),C($l.$$.fragment,l),C(vl.$$.fragment,l),C(Rl.$$.fragment,l),C(Al.$$.fragment,l),C(ql.$$.fragment,l),C(Ql.$$.fragment,l),C(zl.$$.fragment,l),C(Vl.$$.fragment,l),C(Fl.$$.fragment,l),C(Hl.$$.fragment,l),we=!1},d(l){l&&(e(r),e(U),e(J),e(c),e(m),e(s),e(M),e(j),e(H),e(S),e(N),e(D),e(E),e(R),e(u),e(el),e(tl),e(P),e(sl),e(nl),e(K),e(al),e(Ml),e(jl),e(_),e(pl),e(ll),e(ol),e(F),e(wl),e(L),e(il),e(a),e(b),e(hl),e(gl),e(Bl),e(n),e(v),e(Ol),e(le),e(ee),e(te),e(Nl),e(se),e(Xl),e(ne),e(kl),e(ae),e(Me),e(pe),e(xl),e(oe),e(ie),e(ye),e(Yl),e(ce),e(re),e(Je),e(_l),e(me),e(Ue),e(Te),e(Ll),e(ue),e(je),e(Pl)),e(y),Z(T,l),Z(V,l),Z(x,l),Z(q,l),Z(Y,l),Z(z,l),Z(O,l),Z(Ul,l),Z(fl,l),Z(yl,l),Z(Il,l),Z(Zl,l),Z(El,l),Z($l,l),Z(vl,l),Z(Rl,l),Z(Al,l),Z(ql,l),Z(Ql,l),Z(zl,l),Z(Vl,l),Z(Fl,l),Z(Hl,l)}}}const wt='{"title":"Optimized Inference Deployment","local":"optimized-inference-deployment","sections":[{"title":"Framework Selection Guide","local":"framework-selection-guide","sections":[{"title":"Memory Management and Performance","local":"memory-management-and-performance","sections":[],"depth":3},{"title":"Deployment and 
Integration","local":"deployment-and-integration","sections":[],"depth":3}],"depth":2},{"title":"Getting Started","local":"getting-started","sections":[{"title":"Installation and Basic Setup","local":"installation-and-basic-setup","sections":[],"depth":3},{"title":"Basic Text Generation","local":"basic-text-generation","sections":[],"depth":3}],"depth":2},{"title":"Advanced Generation Control","local":"advanced-generation-control","sections":[{"title":"Token Selection and Sampling","local":"token-selection-and-sampling","sections":[],"depth":3},{"title":"Controlling Repetition","local":"controlling-repetition","sections":[],"depth":3},{"title":"Length Control and Stop Sequences","local":"length-control-and-stop-sequences","sections":[],"depth":3}],"depth":2},{"title":"Memory Management","local":"memory-management","sections":[],"depth":2},{"title":"Resources","local":"resources","sections":[],"depth":2}],"depth":1}';function ht(A){return ze(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Gt extends _e{constructor(y){super(),Fe(this,y,ht,jt,Ye,{})}}export{Gt as component};

Xet Storage Details

Size:
101 kB
·
Xet hash:
3829b9665a1c13dd7d9478742638b8ad629380e3a502289b775f6683df381b11

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.