Buckets:

rtrm's picture
download
raw
119 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;ఆప్టిమైజ్డ్ ఇన్ఫరెన్స్ డిప్లాయ్‌మెంట్&quot;,&quot;local&quot;:&quot;ఆపటమజడ-ఇనఫరనస-డపలయమట&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Framework ఎంపిక గైడ్&quot;,&quot;local&quot;:&quot;framework-ఎపక-గడ&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;మెమరీ మేనేజ్‌మెంట్ మరియు పనితీరు&quot;,&quot;local&quot;:&quot;మమర-మనజమట-మరయ-పనతర&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Deployment మరియు Integration&quot;,&quot;local&quot;:&quot;deployment-మరయ-integration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;ప్రారంభించడం&quot;,&quot;local&quot;:&quot;పరరభచడ&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;సంస్థాపన మరియు ప్రాథమిక సెటప్&quot;,&quot;local&quot;:&quot;ససథపన-మరయ-పరథమక-సటప&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;ప్రాథమిక టెక్స్ట్ జనరేషన్&quot;,&quot;local&quot;:&quot;పరథమక-టకసట-జనరషన&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;అధునాతన జనరేషన్ నియంత్రణ&quot;,&quot;local&quot;:&quot;అధనతన-జనరషన-నయతరణ&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Token ఎంపిక మరియు Sampling&quot;,&quot;local&quot;:&quot;token-ఎపక-మరయ-sampling&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;పునరావృతం నివారణ (Controlling Repetition)&quot;,&quot;local&quot;:&quot;పనరవత-నవరణ-controlling-repetition&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;పొడవు నియంత్రణ మరియు Stop Sequences&quot;,&quot;local&quot;:&quot;పడవ-నయతరణ-మరయ-stop-sequences&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;మెమరీ నిర్వహణ&quot;,&quot;local&quot;:&quot;మమర-నరవహణ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;వనరులు (Resources)&quot;,&quot;local&quot;:&quot;వనరల-resources&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1149/te/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/entry/start.eee4c1e0.js">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/scheduler.cc52f4b9.js">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/singletons.69904a63.js">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/index.5033808a.js">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/paths.e8418a83.js">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/entry/app.0f38558a.js">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/preload-helper.ae923e55.js">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/index.bd400c31.js">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/nodes/0.bd814f6d.js">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/nodes/21.568818e3.js">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/Tip.272e6bd8.js">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.25fe1ea6.js">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/CodeBlock.1794e33c.js">
<link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/stores.ac56eab6.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;ఆప్టిమైజ్డ్ ఇన్ఫరెన్స్ డిప్లాయ్‌మెంట్&quot;,&quot;local&quot;:&quot;ఆపటమజడ-ఇనఫరనస-డపలయమట&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Framework ఎంపిక గైడ్&quot;,&quot;local&quot;:&quot;framework-ఎపక-గడ&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;మెమరీ మేనేజ్‌మెంట్ మరియు పనితీరు&quot;,&quot;local&quot;:&quot;మమర-మనజమట-మరయ-పనతర&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Deployment మరియు Integration&quot;,&quot;local&quot;:&quot;deployment-మరయ-integration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;ప్రారంభించడం&quot;,&quot;local&quot;:&quot;పరరభచడ&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;సంస్థాపన మరియు ప్రాథమిక సెటప్&quot;,&quot;local&quot;:&quot;ససథపన-మరయ-పరథమక-సటప&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;ప్రాథమిక టెక్స్ట్ జనరేషన్&quot;,&quot;local&quot;:&quot;పరథమక-టకసట-జనరషన&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;అధునాతన జనరేషన్ నియంత్రణ&quot;,&quot;local&quot;:&quot;అధనతన-జనరషన-నయతరణ&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Token ఎంపిక మరియు Sampling&quot;,&quot;local&quot;:&quot;token-ఎపక-మరయ-sampling&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;పునరావృతం నివారణ (Controlling Repetition)&quot;,&quot;local&quot;:&quot;పనరవత-నవరణ-controlling-repetition&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;పొడవు నియంత్రణ మరియు Stop Sequences&quot;,&quot;local&quot;:&quot;పడవ-నయతరణ-మరయ-stop-sequences&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;మెమరీ నిర్వహణ&quot;,&quot;local&quot;:&quot;మమర-నరవహణ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;వనరులు (Resources)&quot;,&quot;local&quot;:&quot;వనరల-resources&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="ఆపటమజడ-ఇనఫరనస-డపలయమట" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ఆపటమజడ-ఇనఫరనస-డపలయమట"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ఆప్టిమైజ్డ్ ఇన్ఫరెన్స్ డిప్లాయ్‌మెంట్</span></h1> <p data-svelte-h="svelte-s9n4v7">ఈ విభాగంలో, LLM deployments ను ఆప్టిమైజ్ చేయడానికి ఉపయోగించే ఆధునిక frameworks అయిన Text Generation Inference (TGI), vLLM, మరియు llama.cpp గురించి తెలుసుకుందాం.<br>
ఈ అప్లికేషన్లు ముఖ్యంగా production పరిసరాల్లో LLMలను వినియోగదారులకు సర్వ్ చేయడానికి ఉపయోగిస్తారు. ఈ విభాగం ఒకే మెషిన్‌పై inference ఎలా చేయాలో కాదు, production లో ఈ frameworks ను ఎలా deploy చేయాలో మీద దృష్టి పెడుతుంది.</p> <p data-svelte-h="svelte-9t17eo">ఈ tools inference సామర్థ్యాన్ని ఎలా గరిష్టం చేస్తాయి మరియు Large Language Models యొక్క production deployments ను ఎలా సులభతరం చేస్తాయి అనే విషయాలపై చర్చిస్తాం.</p> <h2 class="relative group"><a id="framework-ఎపక-గడ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#framework-ఎపక-గడ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Framework ఎంపిక గైడ్</span></h2> <p data-svelte-h="svelte-pgbai6">TGI, vLLM, మరియు llama.cpp ఒకే విధమైన లక్ష్యంతో పనిచేస్తున్నా, ప్రతి framework కి వేర్వేరు లక్షణాలు ఉన్నాయి, ఇవి ప్రత్యేక use cases కు బాగా సరిపోతాయి.<br>
ఇప్పుడు, వాటి ప్రధాన తేడాలను — ముఖ్యంగా performance మరియు integration పరంగా — పరిశీలిద్దాం.</p> <h3 class="relative group"><a id="మమర-మనజమట-మరయ-పనతర" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#మమర-మనజమట-మరయ-పనతర"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>మెమరీ మేనేజ్‌మెంట్ మరియు పనితీరు</span></h3> <p data-svelte-h="svelte-1hkgjet"><strong>TGI</strong> production కోసం స్థిరమైన, predictable ప్రవర్తన అందించేలా రూపొందించబడింది. ఇది GPU మెమరీ వినియోగాన్ని స్థిరంగా ఉంచేందుకు fixed sequence lengths ఉపయోగిస్తుంది.<br>
TGI, Flash Attention 2 మరియు continuous batching వంటి పద్ధతులతో మెమరీని సమర్థవంతంగా నిర్వహిస్తుంది. దీనివల్ల attention calculations వేగవంతమవుతాయి మరియు GPU నిరుత్సాహంగా ఉండే సమయం తగ్గుతుంది.<br>
అవసరమైతే, సిస్టమ్ మోడల్‌లోని కొన్ని భాగాలను CPU మరియు GPU మధ్య మార్చగలదు, ఇది పెద్ద మోడళ్లను కూడా హ్యాండిల్ చేయడానికి సహాయపడుతుంది.</p> <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/flash-attn.png" alt="Flash Attention"> <blockquote class="tip"><p data-svelte-h="svelte-1g4f572">Flash Attention అనేది transformer models లోని attention mechanism ను ఆప్టిమైజ్ చేయడానికి ఉపయోగించే సాంకేతికత.<br>
మనం <a href="/course/chapter1/8">అధ్యాయం 1.8</a> లో చర్చించినట్టుగా, attention కి quadratic complexity ఉండటం వలన, దీని computation మరియు memory వినియోగం ఎక్కువగా ఉంటుంది.</p> <p data-svelte-h="svelte-5pvn3e">Flash Attention యొక్క ప్రధాన ఆవిష్కరణ HBM (High Bandwidth Memory) మరియు SRAM cache మధ్య memory transfer ను తగ్గించడంలో ఉంది. సాధారణ attention లో ఈ transfers చాలా సార్లు జరుగుతూ bottleneck అవుతాయి.<br>
Flash Attention డేటాను ఒక్కసారి SRAM లోకి లోడ్ చేసి, calculations అంతా అక్కడే పూర్తి చేస్తుంది, దాంతో memory overhead చాలా తగ్గుతుంది.</p> <p data-svelte-h="svelte-716x3h">ఈ ప్రయోజనాలు training సమయంలో ఎక్కువగా కనిపించినా, inference సమయంలో కూడా VRAM వినియోగం తగ్గడం మరియు వేగం పెరగడం వంటి ప్రయోజనాలు అందిస్తుంది.</p></blockquote> <p data-svelte-h="svelte-7jk8pa"><strong>vLLM</strong> పూర్తిగా వేరు విధానాన్ని అనుసరిస్తుంది — దీని ప్రత్యేకత <em>PagedAttention</em>.<br>
ఇది కంప్యూటర్ virtual memory లానే పనిచేస్తుంది: మోడల్ memory ను చిన్న చిన్న “pages” గా విభజిస్తుంది. దీని ద్వారా requests వేర్వేరు sizes అయినా memory ని వృథా చేయకుండా హ్యాండిల్ చేయవచ్చు.<br>
ఇది memory fragmentation ను తగ్గిస్తూ, KV cache ను సమర్థవంతంగా నిర్వహిస్తుంది, తద్వారా throughput భారీగా పెరుగుతుంది.</p> <blockquote class="tip"><p data-svelte-h="svelte-1yldomh">PagedAttention, KV cache నిర్వహణలో ఉండే bottlenecks ను పరిష్కరించడానికి రూపొందించబడింది.<br>
LLM generation సమయంలో, ప్రతి token కి keys మరియు values (KV cache) నిల్వ చేయాలి. దీని memory చాలా పెద్దది అవుతుంది — ముఖ్యంగా long sequences లేదా concurrent requests వద్ద.</p> <p data-svelte-h="svelte-1r7ng17">vLLM యొక్క కీలక ఆవిష్కరణలు ఇవి:</p> <ol data-svelte-h="svelte-1nel57u"><li><strong>Memory Paging</strong> – KV cache ను పెద్ద continuous block లాగా కాకుండా చిన్న pages గా విభజించడం</li> <li><strong>Non-contiguous Storage</strong> – GPU memory లో pages continuous గా ఉండాల్సిన అవసరం లేదు</li> <li><strong>Page Table</strong> – ఏ pages ఏ sequence కు చెందినవి అనే సమాచారం నిర్వహించడం</li> <li><strong>Memory Sharing</strong> – ఒక prompt కి సంబంధించిన KV cache pages ని అనేక sequences మధ్య share చేయడం</li></ol> <p data-svelte-h="svelte-1pwyoq">ఈ పద్ధతి సంప్రదాయ inference పద్ధతుల కంటే <em>24x వరకు</em> ఎక్కువ throughput అందించగలదు.<br>
వివరంగా తెలుసుకోవాలంటే, <a href="https://docs.vllm.ai/en/latest/design/kernel/paged_attention.html" rel="nofollow">vLLM documentation</a> చదవవచ్చు.</p></blockquote> <p data-svelte-h="svelte-7emzsq"><strong>llama.cpp</strong> అనేది అత్యంత ఆప్టిమైజ్ చేయబడిన C/C++ implementation. మొదట ఇది consumer hardware పై LLaMA మోడళ్లను రన్ చేయడానికి రూపొందించబడింది.<br>
ఇది CPU పై అత్యంత సమర్థవంతంగా పనిచేస్తుంది, అవసరమైతే GPU acceleration కూడా అందిస్తుంది.<br>
llama.cpp మోడల్ ను quantize చేసి చిన్న పరిమాణంలోకి మార్చుతుంది — అలా VRAM వినియోగం తగ్గి inference వేగం పెరుగుతుంది.</p> <blockquote class="tip"><p data-svelte-h="svelte-1x0yrvo">Quantization అనేది మోడల్ weights ను FP32/FP16 నుండి తక్కువ precision (INT8, 4-bit మొదలైనవి) కు మార్చే ప్రక్రియ.<br>
ఇది memory వినియోగాన్ని గణనీయంగా తగ్గించి inference వేగాన్ని పెంచుతుంది., accuracy లో తక్కువ నష్టం మాత్రమే ఉంటుంది.</p> <p data-svelte-h="svelte-h1xlnc">llama.cpp లోని ముఖ్యమైన quantization ప్రయోజనాలు:</p> <ol data-svelte-h="svelte-y1mhpg"><li><strong>అనేక precision స్థాయిలు</strong> – INT8, 4-bit, 3-bit, 2-bit వరకు</li> <li><strong>GGML/GGUF ఫార్మాట్లు</strong> – quantized inference కోసం ఆప్టిమైజ్ చేసిన టెన్సర్ ఫార్మాట్లు</li> <li><strong>Mixed precision</strong> – మోడల్‌లో వేర్వేరు భాగాలకు వేర్వేరు quantization స్థాయిలు</li> <li><strong>CPU optimizations</strong> – AVX2, AVX-512, NEON వంటి CPU నిర్మాణాల (architectures) కోసం ఆప్టిమైజ్ చేసిన kernels</li></ol> <p data-svelte-h="svelte-1ov3ssy">ఈ విధానం, తక్కువ memory ఉన్న consumer devices పై కూడా పెద్ద మోడళ్లను రన్ చేయడానికి మార్గాన్ని అందిస్తుంది.</p></blockquote> <h3 class="relative group"><a id="deployment-మరయ-integration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deployment-మరయ-integration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Deployment మరియు Integration</span></h3> <p data-svelte-h="svelte-99dvxk">ఇప్పుడు frameworks deployment మరియు integration పరంగా ఎలా భిన్నంగా ఉన్నాయో చూద్దాం.</p> <p data-svelte-h="svelte-drewzd"><strong>TGI</strong> enterprise-స్థాయి deployment లో అత్యుత్తమం.<br>
ఇది production కి అవసరమైన వాటిని built-in గా అందిస్తుంది — Kubernetes support, monitoring (Prometheus/Grafana), autoscaling, content filtering, rate limiting, security features మొదలైనవి.<br>
అంతేకాకుండా enterprise-grade logging కూడా కలిగి ఉంది.</p> <p data-svelte-h="svelte-eauoi0"><strong>vLLM</strong> flexible మరియు developer-friendly గా రూపొందించబడింది.<br>
ఇది Python ఆధారంగా పనిచేస్తుంది మరియు మీ existing applications లో OpenAI API స్థానంలో సులభంగా plug చేయవచ్చు.<br>
Clusters నిర్వహణ కోసం Ray తో బాగా పని చేస్తుంది.</p> <p data-svelte-h="svelte-mbxtpt"><strong>llama.cpp</strong> సాదాసీదా, తేలికైన server implementation కలిగి ఉంది.<br>
Python frameworks ను install చేయడం కష్టమైన పరిసరాల్లో కూడా deployment సులభం.<br>
ఇది OpenAI-compatible API కూడా అందిస్తుంది, కానీ resource వినియోగం చాలా తక్కువ.</p> <h2 class="relative group"><a id="పరరభచడ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#పరరభచడ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ప్రారంభించడం</span></h2> <p data-svelte-h="svelte-14g2ual">ఇప్పుడు ఈ frameworks ను ఎలా ఉపయోగించాలో — సంస్థాపన (installation) నుండి deployment వరకూ — చూద్దాం.</p> <h3 class="relative group"><a id="ససథపన-మరయ-పరథమక-సటప" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ససథపన-మరయ-పరథమక-సటప"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>సంస్థాపన మరియు ప్రాథమిక సెటప్</span></h3> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select">&lt;hfoption value=&quot;tgi&quot; label=&quot;TGI&quot;&gt;
<p data-svelte-h="svelte-16pya07">TGI ను Hugging Face ecosystem తో బాగా integrate చేశారు, మరియు సంస్థాపన చాలా సులభం.</p> <p data-svelte-h="svelte-cv57m1">మొదట, Docker తో TGI server ను ప్రారంభించండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->docker run --gpus all \
--shm-size 1g \
-p 8080:80 \
-v ~/.cache/huggingface:/data \
ghcr.io/huggingface/text-generation-inference:latest \
--model-id HuggingFaceTB/SmolLM2-360M-Instruct<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1gzvg67">తరువాత Hugging Face InferenceClient తో ఇంటరాక్ట్ అవండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient
<span class="hljs-comment"># Initialize client pointing to TGI endpoint</span>
client = InferenceClient(
model=<span class="hljs-string">&quot;http://localhost:8080&quot;</span>, <span class="hljs-comment"># URL to the TGI server</span>
)
<span class="hljs-comment"># Text generation</span>
response = client.text_generation(
<span class="hljs-string">&quot;Tell me a story&quot;</span>,
max_new_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
details=<span class="hljs-literal">True</span>,
stop_sequences=[],
)
<span class="hljs-built_in">print</span>(response.generated_text)
<span class="hljs-comment"># For chat format</span>
response = client.chat_completion(
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a helpful assistant.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Tell me a story&quot;</span>},
],
max_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-o8nlfq">లేదా OpenAI client ఉపయోగించవచ్చు:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI
<span class="hljs-comment"># Initialize client pointing to TGI endpoint</span>
client = OpenAI(
base_url=<span class="hljs-string">&quot;http://localhost:8080/v1&quot;</span>, <span class="hljs-comment"># Make sure to include /v1</span>
api_key=<span class="hljs-string">&quot;not-needed&quot;</span>, <span class="hljs-comment"># TGI doesn&#x27;t require an API key by default</span>
)
<span class="hljs-comment"># Chat completion</span>
response = client.chat.completions.create(
model=<span class="hljs-string">&quot;HuggingFaceTB/SmolLM2-360M-Instruct&quot;</span>,
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a helpful assistant.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Tell me a story&quot;</span>},
],
max_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;
&lt;hfoption value=&quot;llama.cpp&quot; label=&quot;llama.cpp&quot;&gt;
<p data-svelte-h="svelte-ci9eva">llama.cpp సంస్థాపన చాలా తేలికగా ఉంటుంది; CPU మరియు GPU inference రెండింటిని సపోర్ట్ చేస్తుంది.</p> <p data-svelte-h="svelte-t6wddc">మొదట, llama.cpp ని build చేయండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Clone the repository</span>
git <span class="hljs-built_in">clone</span> https://github.com/ggerganov/llama.cpp
<span class="hljs-built_in">cd</span> llama.cpp
<span class="hljs-comment"># Build the project</span>
make
<span class="hljs-comment"># Download the SmolLM2-1.7B-Instruct-GGUF model</span>
curl -L -O https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF/resolve/main/smollm2-1.7b-instruct.Q4_K_M.gguf<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-rlmx1m">OpenAI-compatible server ను ప్రారంభించండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Start the server</span>
./server \
-m smollm2-1.7b-instruct.Q4_K_M.gguf \
--host 0.0.0.0 \
--port 8080 \
-c 4096 \
--n-gpu-layers 0 <span class="hljs-comment"># Set to a higher number to use GPU</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-123ki15">InferenceClient తో interact చేయండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient
<span class="hljs-comment"># Initialize client pointing to llama.cpp server</span>
client = InferenceClient(
model=<span class="hljs-string">&quot;http://localhost:8080/v1&quot;</span>, <span class="hljs-comment"># URL to the llama.cpp server</span>
token=<span class="hljs-string">&quot;sk-no-key-required&quot;</span>, <span class="hljs-comment"># llama.cpp server requires this placeholder</span>
)
<span class="hljs-comment"># Text generation</span>
response = client.text_generation(
<span class="hljs-string">&quot;Tell me a story&quot;</span>,
max_new_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
details=<span class="hljs-literal">True</span>,
)
<span class="hljs-built_in">print</span>(response.generated_text)
<span class="hljs-comment"># For chat format</span>
response = client.chat_completion(
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a helpful assistant.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Tell me a story&quot;</span>},
],
max_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1app4cd">లేదా OpenAI client ఉపయోగించండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI
<span class="hljs-comment"># Initialize client pointing to llama.cpp server</span>
client = OpenAI(
base_url=<span class="hljs-string">&quot;http://localhost:8080/v1&quot;</span>,
api_key=<span class="hljs-string">&quot;sk-no-key-required&quot;</span>, <span class="hljs-comment"># llama.cpp server requires this placeholder</span>
)
<span class="hljs-comment"># Chat completion</span>
response = client.chat.completions.create(
model=<span class="hljs-string">&quot;smollm2-1.7b-instruct&quot;</span>, <span class="hljs-comment"># Model identifier can be anything as server only loads one model</span>
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a helpful assistant.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Tell me a story&quot;</span>},
],
max_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;
&lt;hfoption value=&quot;vllm&quot; label=&quot;vLLM&quot;&gt;
<p data-svelte-h="svelte-1jetjxb">vLLM సంస్థాపన కూడా అత్యంత సులభం. ఇది OpenAI-compatible API మరియు native Python interface రెండింటినీ అందిస్తుంది.</p> <p data-svelte-h="svelte-1n9fznx">మొదట, vLLM server ను ప్రారంభించండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python -m vllm.entrypoints.openai.api_server \
--model HuggingFaceTB/SmolLM2-360M-Instruct \
--host 0.0.0.0 \
--port 8000<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1eiitrc">InferenceClient తో ఉపయోగించండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient
<span class="hljs-comment"># Initialize client pointing to vLLM endpoint</span>
client = InferenceClient(
model=<span class="hljs-string">&quot;http://localhost:8000/v1&quot;</span>, <span class="hljs-comment"># URL to the vLLM server</span>
)
<span class="hljs-comment"># Text generation</span>
response = client.text_generation(
<span class="hljs-string">&quot;Tell me a story&quot;</span>,
max_new_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
details=<span class="hljs-literal">True</span>,
)
<span class="hljs-built_in">print</span>(response.generated_text)
<span class="hljs-comment"># For chat format</span>
response = client.chat_completion(
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a helpful assistant.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Tell me a story&quot;</span>},
],
max_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xb5y1g">లేదా OpenAI client:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI
<span class="hljs-comment"># Initialize client pointing to vLLM endpoint</span>
client = OpenAI(
base_url=<span class="hljs-string">&quot;http://localhost:8000/v1&quot;</span>,
api_key=<span class="hljs-string">&quot;not-needed&quot;</span>, <span class="hljs-comment"># vLLM doesn&#x27;t require an API key by default</span>
)
<span class="hljs-comment"># Chat completion</span>
response = client.chat.completions.create(
model=<span class="hljs-string">&quot;HuggingFaceTB/SmolLM2-360M-Instruct&quot;</span>,
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a helpful assistant.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Tell me a story&quot;</span>},
],
max_tokens=<span class="hljs-number">100</span>,
temperature=<span class="hljs-number">0.7</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;</div> <h3 class="relative group"><a id="పరథమక-టకసట-జనరషన" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#పరథమక-టకసట-జనరషన"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ప్రాథమిక టెక్స్ట్ జనరేషన్</span></h3> <p data-svelte-h="svelte-ik0sww">ఇప్పుడు frameworks లో text generation ఎలా చేయాలో చూద్దాం.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select">&lt;hfoption value=&quot;tgi&quot; label=&quot;TGI&quot;&gt;
<p data-svelte-h="svelte-93ghn8">మొదట, అభివృద్ధి చెందిన parameters తో TGI deploy చేయండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->docker run --gpus all \
--shm-size 1g \
-p 8080:80 \
-v ~/.cache/huggingface:/data \
ghcr.io/huggingface/text-generation-inference:latest \
--model-id HuggingFaceTB/SmolLM2-360M-Instruct \
--max-total-tokens 4096 \
--max-input-length 3072 \
--max-batch-total-tokens 8192 \
--waiting-served-ratio 1.2<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-12p45o1">InferenceClient తో generation చేయండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient
client = InferenceClient(model=<span class="hljs-string">&quot;http://localhost:8080&quot;</span>)
<span class="hljs-comment"># Advanced parameters example</span>
response = client.chat_completion(
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a creative storyteller.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Write a creative story&quot;</span>},
],
temperature=<span class="hljs-number">0.8</span>,
max_tokens=<span class="hljs-number">200</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)
<span class="hljs-comment"># Raw text generation</span>
response = client.text_generation(
<span class="hljs-string">&quot;Write a creative story about space exploration&quot;</span>,
max_new_tokens=<span class="hljs-number">200</span>,
temperature=<span class="hljs-number">0.8</span>,
top_p=<span class="hljs-number">0.95</span>,
repetition_penalty=<span class="hljs-number">1.1</span>,
do_sample=<span class="hljs-literal">True</span>,
details=<span class="hljs-literal">True</span>,
)
<span class="hljs-built_in">print</span>(response.generated_text)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xb5y1g">లేదా OpenAI client:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI
client = OpenAI(base_url=<span class="hljs-string">&quot;http://localhost:8080/v1&quot;</span>, api_key=<span class="hljs-string">&quot;not-needed&quot;</span>)
<span class="hljs-comment"># Advanced parameters example</span>
response = client.chat.completions.create(
model=<span class="hljs-string">&quot;HuggingFaceTB/SmolLM2-360M-Instruct&quot;</span>,
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a creative storyteller.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Write a creative story&quot;</span>},
],
temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span>
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;
&lt;hfoption value=&quot;llama.cpp&quot; label=&quot;llama.cpp&quot;&gt;
<p data-svelte-h="svelte-1wheeko">llama.cpp లో server launch సమయంలో advanced parameters సెట్ చేయవచ్చు:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->./server \
-m smollm2-1.7b-instruct.Q4_K_M.gguf \
--host 0.0.0.0 \
--port 8080 \
-c 4096 \ <span class="hljs-comment"># Context size</span>
--threads 8 \ <span class="hljs-comment"># CPU threads to use</span>
--batch-size 512 \ <span class="hljs-comment"># Batch size for prompt evaluation</span>
--n-gpu-layers 0 <span class="hljs-comment"># GPU layers (0 = CPU only)</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1eiitrc">InferenceClient తో ఉపయోగించండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient
client = InferenceClient(model=<span class="hljs-string">&quot;http://localhost:8080/v1&quot;</span>, token=<span class="hljs-string">&quot;sk-no-key-required&quot;</span>)
<span class="hljs-comment"># Advanced parameters example</span>
response = client.chat_completion(
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a creative storyteller.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Write a creative story&quot;</span>},
],
temperature=<span class="hljs-number">0.8</span>,
max_tokens=<span class="hljs-number">200</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)
<span class="hljs-comment"># For direct text generation</span>
response = client.text_generation(
<span class="hljs-string">&quot;Write a creative story about space exploration&quot;</span>,
max_new_tokens=<span class="hljs-number">200</span>,
temperature=<span class="hljs-number">0.8</span>,
top_p=<span class="hljs-number">0.95</span>,
repetition_penalty=<span class="hljs-number">1.1</span>,
details=<span class="hljs-literal">True</span>,
)
<span class="hljs-built_in">print</span>(response.generated_text)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-11yv55v">లేదా నమూనా పారామితులపై నియంత్రణతో జనరేషన్ కోసం OpenAI క్లయింట్‌ని ఉపయోగించండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI
client = OpenAI(base_url=<span class="hljs-string">&quot;http://localhost:8080/v1&quot;</span>, api_key=<span class="hljs-string">&quot;sk-no-key-required&quot;</span>)
<span class="hljs-comment"># Advanced parameters example</span>
response = client.chat.completions.create(
model=<span class="hljs-string">&quot;smollm2-1.7b-instruct&quot;</span>,
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a creative storyteller.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Write a creative story&quot;</span>},
],
temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span>
top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Nucleus sampling probability</span>
frequency_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># Reduce repetition of frequent tokens</span>
presence_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># Reduce repetition by penalizing tokens already present</span>
max_tokens=<span class="hljs-number">200</span>, <span class="hljs-comment"># Maximum generation length</span>
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-zxv8zf">అదనంగా, llama.cpp native library తో మరింత నియంత్రణ పొందవచ్చు:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Using llama-cpp-python package for direct model access</span>
<span class="hljs-keyword">from</span> llama_cpp <span class="hljs-keyword">import</span> Llama
<span class="hljs-comment"># Load the model</span>
llm = Llama(
model_path=<span class="hljs-string">&quot;smollm2-1.7b-instruct.Q4_K_M.gguf&quot;</span>,
n_ctx=<span class="hljs-number">4096</span>, <span class="hljs-comment"># Context window size</span>
n_threads=<span class="hljs-number">8</span>, <span class="hljs-comment"># CPU threads</span>
n_gpu_layers=<span class="hljs-number">0</span>, <span class="hljs-comment"># GPU layers (0 = CPU only)</span>
)
<span class="hljs-comment"># Format prompt according to the model&#x27;s expected format</span>
prompt = <span class="hljs-string">&quot;&quot;&quot;&lt;|im_start|&gt;system
You are a creative storyteller.
&lt;|im_end|&gt;
&lt;|im_start|&gt;user
Write a creative story
&lt;|im_end|&gt;
&lt;|im_start|&gt;assistant
&quot;&quot;&quot;</span>
<span class="hljs-comment"># Generate response with precise parameter control</span>
output = llm(
prompt,
max_tokens=<span class="hljs-number">200</span>,
temperature=<span class="hljs-number">0.8</span>,
top_p=<span class="hljs-number">0.95</span>,
frequency_penalty=<span class="hljs-number">0.5</span>,
presence_penalty=<span class="hljs-number">0.5</span>,
stop=[<span class="hljs-string">&quot;&lt;|im_end|&gt;&quot;</span>],
)
<span class="hljs-built_in">print</span>(output[<span class="hljs-string">&quot;choices&quot;</span>][<span class="hljs-number">0</span>][<span class="hljs-string">&quot;text&quot;</span>])<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;
&lt;hfoption value=&quot;vllm&quot; label=&quot;vLLM&quot;&gt;
<p data-svelte-h="svelte-1pwqod3">vLLM తో అధునాతన ఉపయోగం కోసం, మీరు InferenceClient ని ఉపయోగించవచ్చు:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient
client = InferenceClient(model=<span class="hljs-string">&quot;http://localhost:8000/v1&quot;</span>)
<span class="hljs-comment"># Advanced parameters example</span>
response = client.chat_completion(
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a creative storyteller.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Write a creative story&quot;</span>},
],
temperature=<span class="hljs-number">0.8</span>,
max_tokens=<span class="hljs-number">200</span>,
top_p=<span class="hljs-number">0.95</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)
<span class="hljs-comment"># For direct text generation</span>
response = client.text_generation(
<span class="hljs-string">&quot;Write a creative story about space exploration&quot;</span>,
max_new_tokens=<span class="hljs-number">200</span>,
temperature=<span class="hljs-number">0.8</span>,
top_p=<span class="hljs-number">0.95</span>,
details=<span class="hljs-literal">True</span>,
)
<span class="hljs-built_in">print</span>(response.generated_text)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-hbl55w">మీరు OpenAI client కూడా ఉపయోగించవచ్చు:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI
client = OpenAI(base_url=<span class="hljs-string">&quot;http://localhost:8000/v1&quot;</span>, api_key=<span class="hljs-string">&quot;not-needed&quot;</span>)
<span class="hljs-comment"># Advanced parameters example</span>
response = client.chat.completions.create(
model=<span class="hljs-string">&quot;HuggingFaceTB/SmolLM2-360M-Instruct&quot;</span>,
messages=[
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a creative storyteller.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Write a creative story&quot;</span>},
],
temperature=<span class="hljs-number">0.8</span>,
top_p=<span class="hljs-number">0.95</span>,
max_tokens=<span class="hljs-number">200</span>,
)
<span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7oc2vp">vLLM లో స్థానిక పైథాన్ interface కూడా ఉంది:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> vllm <span class="hljs-keyword">import</span> LLM, SamplingParams
<span class="hljs-comment"># Initialize the model with advanced parameters</span>
llm = LLM(
model=<span class="hljs-string">&quot;HuggingFaceTB/SmolLM2-360M-Instruct&quot;</span>,
gpu_memory_utilization=<span class="hljs-number">0.85</span>,
max_num_batched_tokens=<span class="hljs-number">8192</span>,
max_num_seqs=<span class="hljs-number">256</span>,
block_size=<span class="hljs-number">16</span>,
)
<span class="hljs-comment"># Configure sampling parameters</span>
sampling_params = SamplingParams(
temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span>
top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Consider top 95% probability mass</span>
max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Maximum length</span>
presence_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Reduce repetition</span>
frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Reduce repetition</span>
stop=[<span class="hljs-string">&quot;\n\n&quot;</span>, <span class="hljs-string">&quot;###&quot;</span>], <span class="hljs-comment"># Stop sequences</span>
)
<span class="hljs-comment"># Generate text</span>
prompt = <span class="hljs-string">&quot;Write a creative story&quot;</span>
outputs = llm.generate(prompt, sampling_params)
<span class="hljs-built_in">print</span>(outputs[<span class="hljs-number">0</span>].outputs[<span class="hljs-number">0</span>].text)
<span class="hljs-comment"># For chat-style interactions</span>
chat_prompt = [
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a creative storyteller.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;Write a creative story&quot;</span>},
]
formatted_prompt = llm.get_chat_template()(chat_prompt) <span class="hljs-comment"># Uses model&#x27;s chat template</span>
outputs = llm.generate(formatted_prompt, sampling_params)
<span class="hljs-built_in">print</span>(outputs[<span class="hljs-number">0</span>].outputs[<span class="hljs-number">0</span>].text)<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;</div> <h2 class="relative group"><a id="అధనతన-జనరషన-నయతరణ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#అధనతన-జనరషన-నయతరణ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>అధునాతన జనరేషన్ నియంత్రణ</span></h2> <h3 class="relative group"><a id="token-ఎపక-మరయ-sampling" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#token-ఎపక-మరయ-sampling"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Token ఎంపిక మరియు Sampling</span></h3> <p data-svelte-h="svelte-7ig94u">టెక్స్ట్‌ను జనరేట్ చేసే ప్రక్రియలో, ప్రతి దశలో వచ్చే తదుపరి token ను ఎంపిక చేయాలి. ఈ ఎంపికను పలు నియంత్రణ పరామితుల ద్వారా ప్రభావితం చేయవచ్చు:</p> <ol data-svelte-h="svelte-1o46gpg"><li><strong>Raw Logits</strong>: ప్రతి token కోసం మోడల్ ఇచ్చే ప్రారంభ probability విలువలు</li> <li><strong>Temperature</strong>: యాదృచ్ఛికతను నియంత్రిస్తుంది (విలువ ఎక్కువైతే output మరింత creative గా ఉంటుంది)</li> <li><strong>Top-p (Nucleus) Sampling</strong>: మొత్తం probability లో X% వచ్చే వరకు ఉన్న అత్యుత్తమ tokens ను మాత్రమే పరిగణలోకి తీసుకోవడం</li> <li><strong>Top-k Filtering</strong>: అత్యంత సాధ్యమైన k tokens కు selection ను పరిమితం చేయడం</li></ol> <p data-svelte-h="svelte-1246drw">ఈ పరామితులను ఎలా సెట్ చేయాలో ఇక్కడ చూపుతున్నాం:</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select">&lt;hfoption value=&quot;tgi&quot; label=&quot;TGI&quot;&gt;
<div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->client.generate(
<span class="hljs-string">&quot;Write a creative story&quot;</span>,
temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span>
top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Consider top 95% probability mass</span>
top_k=<span class="hljs-number">50</span>, <span class="hljs-comment"># Consider top 50 tokens</span>
max_new_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Maximum length</span>
repetition_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Reduce repetition</span>
)<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;
&lt;hfoption value=&quot;llama.cpp&quot; label=&quot;llama.cpp&quot;&gt;
<div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Via OpenAI API compatibility</span>
response = client.completions.create(
model=<span class="hljs-string">&quot;smollm2-1.7b-instruct&quot;</span>, <span class="hljs-comment"># Model name (can be any string for llama.cpp server)</span>
prompt=<span class="hljs-string">&quot;Write a creative story&quot;</span>,
temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span>
top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Consider top 95% probability mass</span>
frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Reduce repetition</span>
presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Reduce repetition</span>
max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Maximum length</span>
)
<span class="hljs-comment"># Via llama-cpp-python direct access</span>
output = llm(
<span class="hljs-string">&quot;Write a creative story&quot;</span>,
temperature=<span class="hljs-number">0.8</span>,
top_p=<span class="hljs-number">0.95</span>,
top_k=<span class="hljs-number">50</span>,
max_tokens=<span class="hljs-number">100</span>,
repeat_penalty=<span class="hljs-number">1.1</span>,
)<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;
&lt;hfoption value=&quot;vllm&quot; label=&quot;vLLM&quot;&gt;
<div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->params = SamplingParams(
temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span>
top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Consider top 95% probability mass</span>
top_k=<span class="hljs-number">50</span>, <span class="hljs-comment"># Consider top 50 tokens</span>
max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Maximum length</span>
presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Reduce repetition</span>
)
llm.generate(<span class="hljs-string">&quot;Write a creative story&quot;</span>, sampling_params=params)<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;</div> <h3 class="relative group"><a id="పనరవత-నవరణ-controlling-repetition" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#పనరవత-నవరణ-controlling-repetition"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>పునరావృతం నివారణ (Controlling Repetition)</span></h3> <p data-svelte-h="svelte-12vnxln">పునరావృతమైన లేదా ఒకే విధమైన టెక్స్ట్‌ను మోడల్ నిరంతరం ఉత్పత్తి చేయకుండా నిలువరించడానికి frameworks నియంత్రణ పద్ధతులు అందిస్తాయి:</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select">&lt;hfoption value=&quot;tgi&quot; label=&quot;TGI&quot;&gt;
<div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->client.generate(
<span class="hljs-string">&quot;Write a varied text&quot;</span>,
repetition_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Penalize repeated tokens</span>
no_repeat_ngram_size=<span class="hljs-number">3</span>, <span class="hljs-comment"># Prevent 3-gram repetition</span>
)<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;
&lt;hfoption value=&quot;llama.cpp&quot; label=&quot;llama.cpp&quot;&gt;
<div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Via OpenAI API</span>
response = client.completions.create(
model=<span class="hljs-string">&quot;smollm2-1.7b-instruct&quot;</span>,
prompt=<span class="hljs-string">&quot;Write a varied text&quot;</span>,
frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Penalize frequent tokens</span>
presence_penalty=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Penalize tokens already present</span>
)
<span class="hljs-comment"># Via direct library</span>
output = llm(
<span class="hljs-string">&quot;Write a varied text&quot;</span>,
repeat_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Penalize repeated tokens</span>
frequency_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># Additional frequency penalty</span>
presence_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># Additional presence penalty</span>
)<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;
&lt;hfoption value=&quot;vllm&quot; label=&quot;vLLM&quot;&gt;
<div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->params = SamplingParams(
presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Penalize token presence</span>
frequency_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Penalize token frequency</span>
)<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;</div> <h3 class="relative group"><a id="పడవ-నయతరణ-మరయ-stop-sequences" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#పడవ-నయతరణ-మరయ-stop-sequences"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>పొడవు నియంత్రణ మరియు Stop Sequences</span></h3> <p data-svelte-h="svelte-1x4l60u">జనరేట్ చేసే టెక్స్ట్ ఎంత పొడవు ఉండాలి, ఎప్పుడు generation ఆగాలి అనేదాన్ని కూడా నియంత్రించవచ్చు:</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select">&lt;hfoption value=&quot;tgi&quot; label=&quot;TGI&quot;&gt;
<div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->client.generate(
<span class="hljs-string">&quot;Generate a short paragraph&quot;</span>,
max_new_tokens=<span class="hljs-number">100</span>,
min_new_tokens=<span class="hljs-number">10</span>,
stop_sequences=[<span class="hljs-string">&quot;\n\n&quot;</span>, <span class="hljs-string">&quot;###&quot;</span>],
)<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;
&lt;hfoption value=&quot;llama.cpp&quot; label=&quot;llama.cpp&quot;&gt;
<div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Via OpenAI API</span>
response = client.completions.create(
model=<span class="hljs-string">&quot;smollm2-1.7b-instruct&quot;</span>,
prompt=<span class="hljs-string">&quot;Generate a short paragraph&quot;</span>,
max_tokens=<span class="hljs-number">100</span>,
stop=[<span class="hljs-string">&quot;\n\n&quot;</span>, <span class="hljs-string">&quot;###&quot;</span>],
)
<span class="hljs-comment"># Via direct library</span>
output = llm(<span class="hljs-string">&quot;Generate a short paragraph&quot;</span>, max_tokens=<span class="hljs-number">100</span>, stop=[<span class="hljs-string">&quot;\n\n&quot;</span>, <span class="hljs-string">&quot;###&quot;</span>])<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;
&lt;hfoption value=&quot;vllm&quot; label=&quot;vLLM&quot;&gt;
<div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->params = SamplingParams(
max_tokens=<span class="hljs-number">100</span>,
min_tokens=<span class="hljs-number">10</span>,
stop=[<span class="hljs-string">&quot;###&quot;</span>, <span class="hljs-string">&quot;\n\n&quot;</span>],
ignore_eos=<span class="hljs-literal">False</span>,
skip_special_tokens=<span class="hljs-literal">True</span>,
)<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;</div> <h2 class="relative group"><a id="మమర-నరవహణ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#మమర-నరవహణ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>మెమరీ నిర్వహణ</span></h2> <p data-svelte-h="svelte-16gmr3r">సమర్థవంతమైన inference కోసం ఈ frameworks అన్నీ అభివృద్ధి చెందిన మెమరీ మేనేజ్‌మెంట్ విధానాలను ఉపయోగిస్తాయి.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select">&lt;hfoption value=&quot;tgi&quot; label=&quot;TGI&quot;&gt;
<p data-svelte-h="svelte-1i9s29h">TGI, Flash Attention 2 మరియు continuous batching ను ఉపయోగిస్తుంది:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Docker deployment with memory optimization</span>
docker run --gpus all -p 8080:80 \
--shm-size 1g \
ghcr.io/huggingface/text-generation-inference:latest \
--model-id HuggingFaceTB/SmolLM2-1.7B-Instruct \
--max-batch-total-tokens 8192 \
--max-input-length 4096<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;
&lt;hfoption value=&quot;llama.cpp&quot; label=&quot;llama.cpp&quot;&gt;
<p data-svelte-h="svelte-f1gk1a">llama.cpp లో quantization మరియు optimized memory layout వాడుతుంది:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Server with memory optimizations</span>
./server \
-m smollm2-1.7b-instruct.Q4_K_M.gguf \
--host 0.0.0.0 \
--port 8080 \
-c 2048 \ <span class="hljs-comment"># Context size</span>
--threads 4 \ <span class="hljs-comment"># CPU threads</span>
--n-gpu-layers 32 \ <span class="hljs-comment"># Use more GPU layers for larger models</span>
--mlock \ <span class="hljs-comment"># Lock memory to prevent swapping</span>
--cont-batching <span class="hljs-comment"># Enable continuous batching</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1cixy12">మీ GPU కి చాలా పెద్దగా ఉన్న మోడళ్ల కోసం CPU offloading ను ఉపయోగించవచ్చు:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->./server \
-m smollm2-1.7b-instruct.Q4_K_M.gguf \
--n-gpu-layers 20 \ <span class="hljs-comment"># Keep first 20 layers on GPU</span>
--threads 8 <span class="hljs-comment"># Use more CPU threads for CPU layers</span><!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;
&lt;hfoption value=&quot;vllm&quot; label=&quot;vLLM&quot;&gt;
<p data-svelte-h="svelte-gk2ujy">vLLM, మెమరీని అత్యంత సమర్థవంతంగా నిర్వహించేందుకు PagedAttention ను ఉపయోగిస్తుంది:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> vllm.engine.arg_utils <span class="hljs-keyword">import</span> AsyncEngineArgs
engine_args = AsyncEngineArgs(
model=<span class="hljs-string">&quot;HuggingFaceTB/SmolLM2-1.7B-Instruct&quot;</span>,
gpu_memory_utilization=<span class="hljs-number">0.85</span>,
max_num_batched_tokens=<span class="hljs-number">8192</span>,
block_size=<span class="hljs-number">16</span>,
)
llm = LLM(engine_args=engine_args)<!-- HTML_TAG_END --></pre></div>
&lt;/hfoption&gt;</div> <h2 class="relative group"><a id="వనరల-resources" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#వనరల-resources"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>వనరులు (Resources)</span></h2> <ul data-svelte-h="svelte-15h1dzu"><li><a href="https://huggingface.co/docs/text-generation-inference" rel="nofollow">Text Generation Inference Documentation</a></li> <li><a href="https://github.com/huggingface/text-generation-inference" rel="nofollow">TGI GitHub Repository</a></li> <li><a href="https://vllm.readthedocs.io/" rel="nofollow">vLLM Documentation</a></li> <li><a href="https://github.com/vllm-project/vllm" rel="nofollow">vLLM GitHub Repository</a></li> <li><a href="https://arxiv.org/abs/2309.06180" rel="nofollow">PagedAttention Paper</a></li> <li><a href="https://github.com/ggerganov/llama.cpp" rel="nofollow">llama.cpp GitHub Repository</a></li> <li><a href="https://github.com/abetlen/llama-cpp-python" rel="nofollow">llama-cpp-python Repository</a></li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/te/chapter2/8.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1e95xo9 = {
assets: "/docs/course/pr_1149/te",
base: "/docs/course/pr_1149/te",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/course/pr_1149/te/_app/immutable/entry/start.eee4c1e0.js"),
import("/docs/course/pr_1149/te/_app/immutable/entry/app.0f38558a.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 21],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
119 kB
·
Xet hash:
6e9df99e5f4b6818760e9acdd3bef412528f7c4b86086b0bc0e02ef3d0cb1185

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.