Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content='{"title":"ఆప్టిమైజ్డ్ ఇన్ఫరెన్స్ డిప్లాయ్మెంట్","local":"ఆపటమజడ-ఇనఫరనస-డపలయమట","sections":[{"title":"Framework ఎంపిక గైడ్","local":"framework-ఎపక-గడ","sections":[{"title":"మెమరీ మేనేజ్మెంట్ మరియు పనితీరు","local":"మమర-మనజమట-మరయ-పనతర","sections":[],"depth":3},{"title":"Deployment మరియు Integration","local":"deployment-మరయ-integration","sections":[],"depth":3}],"depth":2},{"title":"ప్రారంభించడం","local":"పరరభచడ","sections":[{"title":"సంస్థాపన మరియు ప్రాథమిక సెటప్","local":"ససథపన-మరయ-పరథమక-సటప","sections":[],"depth":3},{"title":"ప్రాథమిక టెక్స్ట్ జనరేషన్","local":"పరథమక-టకసట-జనరషన","sections":[],"depth":3}],"depth":2},{"title":"అధునాతన జనరేషన్ నియంత్రణ","local":"అధనతన-జనరషన-నయతరణ","sections":[{"title":"Token ఎంపిక మరియు Sampling","local":"token-ఎపక-మరయ-sampling","sections":[],"depth":3},{"title":"పునరావృతం నివారణ (Controlling Repetition)","local":"పనరవత-నవరణ-controlling-repetition","sections":[],"depth":3},{"title":"పొడవు నియంత్రణ మరియు Stop Sequences","local":"పడవ-నయతరణ-మరయ-stop-sequences","sections":[],"depth":3}],"depth":2},{"title":"మెమరీ నిర్వహణ","local":"మమర-నరవహణ","sections":[],"depth":2},{"title":"వనరులు (Resources)","local":"వనరల-resources","sections":[],"depth":2}],"depth":1}'> | |
| <link href="/docs/course/pr_1149/te/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/entry/start.eee4c1e0.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/scheduler.cc52f4b9.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/singletons.69904a63.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/index.5033808a.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/paths.e8418a83.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/entry/app.0f38558a.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/preload-helper.ae923e55.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/index.bd400c31.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/nodes/0.bd814f6d.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/nodes/21.568818e3.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/Tip.272e6bd8.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.25fe1ea6.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/CodeBlock.1794e33c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1149/te/_app/immutable/chunks/stores.ac56eab6.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"ఆప్టిమైజ్డ్ ఇన్ఫరెన్స్ డిప్లాయ్మెంట్","local":"ఆపటమజడ-ఇనఫరనస-డపలయమట","sections":[{"title":"Framework ఎంపిక గైడ్","local":"framework-ఎపక-గడ","sections":[{"title":"మెమరీ మేనేజ్మెంట్ మరియు పనితీరు","local":"మమర-మనజమట-మరయ-పనతర","sections":[],"depth":3},{"title":"Deployment మరియు Integration","local":"deployment-మరయ-integration","sections":[],"depth":3}],"depth":2},{"title":"ప్రారంభించడం","local":"పరరభచడ","sections":[{"title":"సంస్థాపన మరియు ప్రాథమిక సెటప్","local":"ససథపన-మరయ-పరథమక-సటప","sections":[],"depth":3},{"title":"ప్రాథమిక టెక్స్ట్ జనరేషన్","local":"పరథమక-టకసట-జనరషన","sections":[],"depth":3}],"depth":2},{"title":"అధునాతన జనరేషన్ నియంత్రణ","local":"అధనతన-జనరషన-నయతరణ","sections":[{"title":"Token ఎంపిక మరియు Sampling","local":"token-ఎపక-మరయ-sampling","sections":[],"depth":3},{"title":"పునరావృతం నివారణ (Controlling Repetition)","local":"పనరవత-నవరణ-controlling-repetition","sections":[],"depth":3},{"title":"పొడవు నియంత్రణ మరియు Stop Sequences","local":"పడవ-నయతరణ-మరయ-stop-sequences","sections":[],"depth":3}],"depth":2},{"title":"మెమరీ నిర్వహణ","local":"మమర-నరవహణ","sections":[],"depth":2},{"title":"వనరులు (Resources)","local":"వనరల-resources","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" 
aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="ఆపటమజడ-ఇనఫరనస-డపలయమట" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ఆపటమజడ-ఇనఫరనస-డపలయమట"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 
1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ఆప్టిమైజ్డ్ ఇన్ఫరెన్స్ డిప్లాయ్మెంట్</span></h1> <p data-svelte-h="svelte-s9n4v7">ఈ విభాగంలో, LLM deployments ను ఆప్టిమైజ్ చేయడానికి ఉపయోగించే ఆధునిక frameworks అయిన Text Generation Inference (TGI), vLLM, మరియు llama.cpp గురించి తెలుసుకుందాం.<br> | |
| ఈ అప్లికేషన్లు ముఖ్యంగా production పరిసరాల్లో LLMలను వినియోగదారులకు సర్వ్ చేయడానికి ఉపయోగిస్తారు. ఈ విభాగం ఒకే మెషిన్పై inference ఎలా చేయాలో కాదు, production లో ఈ frameworks ను ఎలా deploy చేయాలో మీద దృష్టి పెడుతుంది.</p> <p data-svelte-h="svelte-9t17eo">ఈ tools inference సామర్థ్యాన్ని ఎలా గరిష్టం చేస్తాయి మరియు Large Language Models యొక్క production deployments ను ఎలా సులభతరం చేస్తాయి అనే విషయాలపై చర్చిస్తాం.</p> <h2 class="relative group"><a id="framework-ఎపక-గడ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#framework-ఎపక-గడ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Framework ఎంపిక గైడ్</span></h2> <p data-svelte-h="svelte-pgbai6">TGI, vLLM, మరియు llama.cpp ఒకే విధమైన లక్ష్యంతో పనిచేస్తున్నా, ప్రతి framework కి వేర్వేరు లక్షణాలు ఉన్నాయి, ఇవి ప్రత్యేక use cases కు బాగా సరిపోతాయి.<br> | |
| ఇప్పుడు, వాటి ప్రధాన తేడాలను — ముఖ్యంగా performance మరియు integration పరంగా — పరిశీలిద్దాం.</p> <h3 class="relative group"><a id="మమర-మనజమట-మరయ-పనతర" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#మమర-మనజమట-మరయ-పనతర"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>మెమరీ మేనేజ్మెంట్ మరియు పనితీరు</span></h3> <p data-svelte-h="svelte-1hkgjet"><strong>TGI</strong> production కోసం స్థిరమైన, predictable ప్రవర్తన అందించేలా రూపొందించబడింది. ఇది GPU మెమరీ వినియోగాన్ని స్థిరంగా ఉంచేందుకు fixed sequence lengths ఉపయోగిస్తుంది.<br> | |
| TGI, Flash Attention 2 మరియు continuous batching వంటి పద్ధతులతో మెమరీని సమర్థవంతంగా నిర్వహిస్తుంది. దీనివల్ల attention calculations వేగవంతమవుతాయి మరియు GPU ఖాళీగా (idle) ఉండే సమయం తగ్గుతుంది.<br> | |
| అవసరమైతే, సిస్టమ్ మోడల్లోని కొన్ని భాగాలను CPU మరియు GPU మధ్య మార్చగలదు, ఇది పెద్ద మోడళ్లను కూడా హ్యాండిల్ చేయడానికి సహాయపడుతుంది.</p> <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/flash-attn.png" alt="Flash Attention"> <blockquote class="tip"><p data-svelte-h="svelte-1g4f572">Flash Attention అనేది transformer models లోని attention mechanism ను ఆప్టిమైజ్ చేయడానికి ఉపయోగించే సాంకేతికత.<br> | |
| మనం <a href="/course/chapter1/8">అధ్యాయం 1.8</a> లో చర్చించినట్టుగా, attention కి quadratic complexity ఉండటం వలన, దీని computation మరియు memory వినియోగం ఎక్కువగా ఉంటుంది.</p> <p data-svelte-h="svelte-5pvn3e">Flash Attention యొక్క ప్రధాన ఆవిష్కరణ HBM (High Bandwidth Memory) మరియు SRAM cache మధ్య memory transfer ను తగ్గించడంలో ఉంది. సాధారణ attention లో ఈ transfers చాలా సార్లు జరుగుతూ bottleneck అవుతాయి.<br> | |
| Flash Attention డేటాను ఒక్కసారి SRAM లోకి లోడ్ చేసి, calculations అంతా అక్కడే పూర్తి చేస్తుంది, దాంతో memory overhead చాలా తగ్గుతుంది.</p> <p data-svelte-h="svelte-716x3h">ఈ ప్రయోజనాలు training సమయంలో ఎక్కువగా కనిపించినా, Flash Attention inference సమయంలో కూడా VRAM వినియోగాన్ని తగ్గించి, వేగాన్ని పెంచుతుంది.</p></blockquote> <p data-svelte-h="svelte-7jk8pa"><strong>vLLM</strong> పూర్తిగా వేరు విధానాన్ని అనుసరిస్తుంది — దీని ప్రత్యేకత <em>PagedAttention</em>.<br> | |
| ఇది కంప్యూటర్ virtual memory లానే పనిచేస్తుంది: మోడల్ memory ను చిన్న చిన్న “pages” గా విభజిస్తుంది. దీని ద్వారా requests వేర్వేరు sizes అయినా memory ని వృథా చేయకుండా హ్యాండిల్ చేయవచ్చు.<br> | |
| ఇది memory fragmentation ను తగ్గిస్తూ, KV cache ను సమర్థవంతంగా నిర్వహిస్తుంది, తద్వారా throughput భారీగా పెరుగుతుంది.</p> <blockquote class="tip"><p data-svelte-h="svelte-1yldomh">PagedAttention, KV cache నిర్వహణలో ఉండే bottlenecks ను పరిష్కరించడానికి రూపొందించబడింది.<br> | |
| LLM generation సమయంలో, ప్రతి token కి keys మరియు values (KV cache) నిల్వ చేయాలి. దీని memory చాలా పెద్దది అవుతుంది — ముఖ్యంగా long sequences లేదా concurrent requests వద్ద.</p> <p data-svelte-h="svelte-1r7ng17">vLLM యొక్క కీలక ఆవిష్కరణలు ఇవి:</p> <ol data-svelte-h="svelte-1nel57u"><li><strong>Memory Paging</strong> – KV cache ను పెద్ద continuous block లాగా కాకుండా చిన్న pages గా విభజించడం</li> <li><strong>Non-contiguous Storage</strong> – GPU memory లో pages continuous గా ఉండాల్సిన అవసరం లేదు</li> <li><strong>Page Table</strong> – ఏ pages ఏ sequence కు చెందినవి అనే సమాచారం నిర్వహించడం</li> <li><strong>Memory Sharing</strong> – ఒక prompt కి సంబంధించిన KV cache pages ని అనేక sequences మధ్య share చేయడం</li></ol> <p data-svelte-h="svelte-1pwyoq">ఈ పద్ధతి సంప్రదాయ inference పద్ధతుల కంటే <em>24x వరకు</em> ఎక్కువ throughput అందించగలదు.<br> | |
| వివరంగా తెలుసుకోవాలంటే, <a href="https://docs.vllm.ai/en/latest/design/kernel/paged_attention.html" rel="nofollow">vLLM documentation</a> చదవవచ్చు.</p></blockquote> <p data-svelte-h="svelte-7emzsq"><strong>llama.cpp</strong> అనేది అత్యంత ఆప్టిమైజ్ చేయబడిన C/C++ implementation. మొదట ఇది consumer hardware పై LLaMA మోడళ్లను రన్ చేయడానికి రూపొందించబడింది.<br> | |
| ఇది CPU పై అత్యంత సమర్థవంతంగా పనిచేస్తుంది, అవసరమైతే GPU acceleration కూడా అందిస్తుంది.<br> | |
| llama.cpp మోడల్ ను quantize చేసి చిన్న పరిమాణంలోకి మార్చుతుంది — అలా VRAM వినియోగం తగ్గి inference వేగం పెరుగుతుంది.</p> <blockquote class="tip"><p data-svelte-h="svelte-1x0yrvo">Quantization అనేది మోడల్ weights ను FP32/FP16 నుండి తక్కువ precision (INT8, 4-bit మొదలైనవి) కు మార్చే ప్రక్రియ.<br> | |
| ఇది memory వినియోగాన్ని గణనీయంగా తగ్గించి inference వేగాన్ని పెంచుతుంది, అదే సమయంలో accuracy లో తక్కువ నష్టం మాత్రమే ఉంటుంది.</p> <p data-svelte-h="svelte-h1xlnc">llama.cpp లోని ముఖ్యమైన quantization ప్రయోజనాలు:</p> <ol data-svelte-h="svelte-y1mhpg"><li><strong>అనేక precision స్థాయిలు</strong> – INT8, 4-bit, 3-bit, 2-bit వరకు</li> <li><strong>GGML/GGUF ఫార్మాట్లు</strong> – quantized inference కోసం ఆప్టిమైజ్ చేసిన టెన్సర్ ఫార్మాట్లు</li> <li><strong>Mixed precision</strong> – మోడల్లో వేర్వేరు భాగాలకు వేర్వేరు quantization స్థాయిలు</li> <li><strong>CPU optimizations</strong> – AVX2, AVX-512, NEON వంటి CPU నిర్మాణాల (architectures) కోసం ఆప్టిమైజ్ చేసిన kernels</li></ol> <p data-svelte-h="svelte-1ov3ssy">ఈ విధానం, తక్కువ memory ఉన్న consumer devices పై కూడా పెద్ద మోడళ్లను రన్ చేయడానికి మార్గాన్ని అందిస్తుంది.</p></blockquote> <h3 class="relative group"><a id="deployment-మరయ-integration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deployment-మరయ-integration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Deployment మరియు Integration</span></h3> <p data-svelte-h="svelte-99dvxk">ఇప్పుడు frameworks deployment మరియు integration పరంగా 
ఎలా భిన్నంగా ఉన్నాయో చూద్దాం.</p> <p data-svelte-h="svelte-drewzd"><strong>TGI</strong> enterprise-స్థాయి deployment లో అత్యుత్తమం.<br> | |
| ఇది production కి అవసరమైన వాటిని built-in గా అందిస్తుంది — Kubernetes support, monitoring (Prometheus/Grafana), autoscaling, content filtering, rate limiting, security features మొదలైనవి.<br> | |
| అంతేకాకుండా enterprise-grade logging కూడా కలిగి ఉంది.</p> <p data-svelte-h="svelte-eauoi0"><strong>vLLM</strong> flexible మరియు developer-friendly గా రూపొందించబడింది.<br> | |
| ఇది Python ఆధారంగా పనిచేస్తుంది మరియు మీ existing applications లో OpenAI API స్థానంలో సులభంగా plug చేయవచ్చు.<br> | |
| Clusters నిర్వహణ కోసం Ray తో బాగా పని చేస్తుంది.</p> <p data-svelte-h="svelte-mbxtpt"><strong>llama.cpp</strong> సాదాసీదా, తేలికైన server implementation కలిగి ఉంది.<br> | |
| Python frameworks ను install చేయడం కష్టమైన పరిసరాల్లో కూడా deployment సులభం.<br> | |
| ఇది OpenAI-compatible API కూడా అందిస్తుంది, కానీ resource వినియోగం చాలా తక్కువ.</p> <h2 class="relative group"><a id="పరరభచడ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#పరరభచడ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ప్రారంభించడం</span></h2> <p data-svelte-h="svelte-14g2ual">ఇప్పుడు ఈ frameworks ను ఎలా ఉపయోగించాలో — సంస్థాపన (installation) నుండి deployment వరకూ — చూద్దాం.</p> <h3 class="relative group"><a id="ససథపన-మరయ-పరథమక-సటప" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ససథపన-మరయ-పరథమక-సటప"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 
8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>సంస్థాపన మరియు ప్రాథమిక సెటప్</span></h3> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <p data-svelte-h="svelte-16pya07">TGI ను Hugging Face ecosystem తో బాగా integrate చేశారు, మరియు సంస్థాపన చాలా సులభం.</p> <p data-svelte-h="svelte-cv57m1">మొదట, Docker తో TGI server ను ప్రారంభించండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->docker run --gpus all \ | |
| --shm-size 1g \ | |
| -p 8080:80 \ | |
| -v ~/.cache/huggingface:/data \ | |
| ghcr.io/huggingface/text-generation-inference:latest \ | |
| --model-id HuggingFaceTB/SmolLM2-360M-Instruct<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1gzvg67">తరువాత Hugging Face InferenceClient తో ఇంటరాక్ట్ అవండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| <span class="hljs-comment"># Initialize client pointing to TGI endpoint</span> | |
| client = InferenceClient( | |
| model=<span class="hljs-string">"http://localhost:8080"</span>, <span class="hljs-comment"># URL to the TGI server</span> | |
| ) | |
| <span class="hljs-comment"># Text generation</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Tell me a story"</span>, | |
| max_new_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| stop_sequences=[], | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text) | |
| <span class="hljs-comment"># For chat format</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tell me a story"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-o8nlfq">లేదా OpenAI client ఉపయోగించవచ్చు:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| <span class="hljs-comment"># Initialize client pointing to TGI endpoint</span> | |
| client = OpenAI( | |
| base_url=<span class="hljs-string">"http://localhost:8080/v1"</span>, <span class="hljs-comment"># Make sure to include /v1</span> | |
| api_key=<span class="hljs-string">"not-needed"</span>, <span class="hljs-comment"># TGI doesn't require an API key by default</span> | |
| ) | |
| <span class="hljs-comment"># Chat completion</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tell me a story"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <p data-svelte-h="svelte-ci9eva">llama.cpp సంస్థాపన చాలా తేలికగా ఉంటుంది; CPU మరియు GPU inference రెండింటిని సపోర్ట్ చేస్తుంది.</p> <p data-svelte-h="svelte-t6wddc">మొదట, llama.cpp ని build చేయండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Clone the repository</span> | |
| git <span class="hljs-built_in">clone</span> https://github.com/ggerganov/llama.cpp | |
| <span class="hljs-built_in">cd</span> llama.cpp | |
| <span class="hljs-comment"># Build the project</span> | |
| make | |
| <span class="hljs-comment"># Download the SmolLM2-1.7B-Instruct-GGUF model</span> | |
| curl -L -O https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF/resolve/main/smollm2-1.7b-instruct.Q4_K_M.gguf<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-rlmx1m">OpenAI-compatible server ను ప్రారంభించండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Start the server</span> | |
| ./server \ | |
| -m smollm2-1.7b-instruct.Q4_K_M.gguf \ | |
| --host 0.0.0.0 \ | |
| --port 8080 \ | |
| -c 4096 \ | |
| --n-gpu-layers 0 <span class="hljs-comment"># Set to a higher number to use GPU</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-123ki15">InferenceClient తో interact చేయండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| <span class="hljs-comment"># Initialize client pointing to llama.cpp server</span> | |
| client = InferenceClient( | |
| model=<span class="hljs-string">"http://localhost:8080/v1"</span>, <span class="hljs-comment"># URL to the llama.cpp server</span> | |
| token=<span class="hljs-string">"sk-no-key-required"</span>, <span class="hljs-comment"># llama.cpp server requires this placeholder</span> | |
| ) | |
| <span class="hljs-comment"># Text generation</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Tell me a story"</span>, | |
| max_new_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text) | |
| <span class="hljs-comment"># For chat format</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tell me a story"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1app4cd">లేదా OpenAI client ఉపయోగించండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| <span class="hljs-comment"># Initialize client pointing to llama.cpp server</span> | |
| client = OpenAI( | |
| base_url=<span class="hljs-string">"http://localhost:8080/v1"</span>, | |
| api_key=<span class="hljs-string">"sk-no-key-required"</span>, <span class="hljs-comment"># llama.cpp server requires this placeholder</span> | |
| ) | |
| <span class="hljs-comment"># Chat completion</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, <span class="hljs-comment"># Model identifier can be anything as server only loads one model</span> | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tell me a story"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <p data-svelte-h="svelte-1jetjxb">vLLM సంస్థాపన కూడా అత్యంత సులభం. ఇది OpenAI-compatible API మరియు native Python interface రెండింటినీ అందిస్తుంది.</p> <p data-svelte-h="svelte-1n9fznx">మొదట, vLLM server ను ప్రారంభించండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python -m vllm.entrypoints.openai.api_server \ | |
| --model HuggingFaceTB/SmolLM2-360M-Instruct \ | |
| --host 0.0.0.0 \ | |
| --port 8000<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1eiitrc">InferenceClient తో ఉపయోగించండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| <span class="hljs-comment"># Initialize client pointing to vLLM endpoint</span> | |
| client = InferenceClient( | |
| model=<span class="hljs-string">"http://localhost:8000/v1"</span>, <span class="hljs-comment"># URL to the vLLM server</span> | |
| ) | |
| <span class="hljs-comment"># Text generation</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Tell me a story"</span>, | |
| max_new_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text) | |
| <span class="hljs-comment"># For chat format</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tell me a story"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xb5y1g">లేదా OpenAI client:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| <span class="hljs-comment"># Initialize client pointing to vLLM endpoint</span> | |
| client = OpenAI( | |
| base_url=<span class="hljs-string">"http://localhost:8000/v1"</span>, | |
| api_key=<span class="hljs-string">"not-needed"</span>, <span class="hljs-comment"># vLLM doesn't require an API key by default</span> | |
| ) | |
| <span class="hljs-comment"># Chat completion</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tell me a story"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h3 class="relative group"><a id="పరథమక-టకసట-జనరషన" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#పరథమక-టకసట-జనరషన"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ప్రాథమిక టెక్స్ట్ జనరేషన్</span></h3> <p data-svelte-h="svelte-ik0sww">ఇప్పుడు frameworks లో text generation ఎలా చేయాలో చూద్దాం.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <p data-svelte-h="svelte-93ghn8">మొదట, అధునాతన parameters తో TGI deploy చేయండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->docker run --gpus all \ | |
| --shm-size 1g \ | |
| -p 8080:80 \ | |
| -v ~/.cache/huggingface:/data \ | |
| ghcr.io/huggingface/text-generation-inference:latest \ | |
| --model-id HuggingFaceTB/SmolLM2-360M-Instruct \ | |
| --max-total-tokens 4096 \ | |
| --max-input-length 3072 \ | |
| --max-batch-total-tokens 8192 \ | |
| --waiting-served-ratio 1.2<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-12p45o1">InferenceClient తో generation చేయండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| client = InferenceClient(model=<span class="hljs-string">"http://localhost:8080"</span>) | |
| <span class="hljs-comment"># Advanced parameters example</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a creative storyteller."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Write a creative story"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content) | |
| <span class="hljs-comment"># Raw text generation</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Write a creative story about space exploration"</span>, | |
| max_new_tokens=<span class="hljs-number">200</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| repetition_penalty=<span class="hljs-number">1.1</span>, | |
| do_sample=<span class="hljs-literal">True</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xb5y1g">లేదా OpenAI client:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| client = OpenAI(base_url=<span class="hljs-string">"http://localhost:8080/v1"</span>, api_key=<span class="hljs-string">"not-needed"</span>) | |
| <span class="hljs-comment"># Advanced parameters example</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a creative storyteller."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Write a creative story"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span> | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <p data-svelte-h="svelte-1wheeko">llama.cpp లో server launch సమయంలో advanced parameters సెట్ చేయవచ్చు:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># -c: Context size</span> | |
| <span class="hljs-comment"># --threads: CPU threads to use</span> | |
| <span class="hljs-comment"># --batch-size: Batch size for prompt evaluation</span> | |
| ./server \ | |
| -m smollm2-1.7b-instruct.Q4_K_M.gguf \ | |
| --host 0.0.0.0 \ | |
| --port 8080 \ | |
| -c 4096 \ | |
| --threads 8 \ | |
| --batch-size 512 \ | |
| --n-gpu-layers 0 <span class="hljs-comment"># GPU layers (0 = CPU only)</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1eiitrc">InferenceClient తో ఉపయోగించండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| client = InferenceClient(model=<span class="hljs-string">"http://localhost:8080/v1"</span>, token=<span class="hljs-string">"sk-no-key-required"</span>) | |
| <span class="hljs-comment"># Advanced parameters example</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a creative storyteller."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Write a creative story"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content) | |
| <span class="hljs-comment"># For direct text generation</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Write a creative story about space exploration"</span>, | |
| max_new_tokens=<span class="hljs-number">200</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| repetition_penalty=<span class="hljs-number">1.1</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-11yv55v">లేదా sampling parameters పై నియంత్రణతో generation కోసం OpenAI client ని ఉపయోగించండి:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| client = OpenAI(base_url=<span class="hljs-string">"http://localhost:8080/v1"</span>, api_key=<span class="hljs-string">"sk-no-key-required"</span>) | |
| <span class="hljs-comment"># Advanced parameters example</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a creative storyteller."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Write a creative story"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span> | |
| top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Nucleus sampling probability</span> | |
| frequency_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># Reduce repetition of frequent tokens</span> | |
| presence_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># Reduce repetition by penalizing tokens already present</span> | |
| max_tokens=<span class="hljs-number">200</span>, <span class="hljs-comment"># Maximum generation length</span> | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-zxv8zf">అదనంగా, llama.cpp native library తో మరింత నియంత్రణ పొందవచ్చు:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Using llama-cpp-python package for direct model access</span> | |
| <span class="hljs-keyword">from</span> llama_cpp <span class="hljs-keyword">import</span> Llama | |
| <span class="hljs-comment"># Load the model</span> | |
| llm = Llama( | |
| model_path=<span class="hljs-string">"smollm2-1.7b-instruct.Q4_K_M.gguf"</span>, | |
| n_ctx=<span class="hljs-number">4096</span>, <span class="hljs-comment"># Context window size</span> | |
| n_threads=<span class="hljs-number">8</span>, <span class="hljs-comment"># CPU threads</span> | |
| n_gpu_layers=<span class="hljs-number">0</span>, <span class="hljs-comment"># GPU layers (0 = CPU only)</span> | |
| ) | |
| <span class="hljs-comment"># Format prompt according to the model's expected format</span> | |
| prompt = <span class="hljs-string">"""&lt;|im_start|&gt;system | |
| You are a creative storyteller. | |
| &lt;|im_end|&gt; | |
| &lt;|im_start|&gt;user | |
| Write a creative story | |
| &lt;|im_end|&gt; | |
| &lt;|im_start|&gt;assistant | |
| """</span> | |
| <span class="hljs-comment"># Generate response with precise parameter control</span> | |
| output = llm( | |
| prompt, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| frequency_penalty=<span class="hljs-number">0.5</span>, | |
| presence_penalty=<span class="hljs-number">0.5</span>, | |
| stop=[<span class="hljs-string">"&lt;|im_end|&gt;"</span>], | |
| ) | |
| <span class="hljs-built_in">print</span>(output[<span class="hljs-string">"choices"</span>][<span class="hljs-number">0</span>][<span class="hljs-string">"text"</span>])<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <p data-svelte-h="svelte-1pwqod3">vLLM తో అధునాతన ఉపయోగం కోసం, మీరు InferenceClient ని ఉపయోగించవచ్చు:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| client = InferenceClient(model=<span class="hljs-string">"http://localhost:8000/v1"</span>) | |
| <span class="hljs-comment"># Advanced parameters example</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a creative storyteller."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Write a creative story"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content) | |
| <span class="hljs-comment"># For direct text generation</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Write a creative story about space exploration"</span>, | |
| max_new_tokens=<span class="hljs-number">200</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-hbl55w">మీరు OpenAI client కూడా ఉపయోగించవచ్చు:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| client = OpenAI(base_url=<span class="hljs-string">"http://localhost:8000/v1"</span>, api_key=<span class="hljs-string">"not-needed"</span>) | |
| <span class="hljs-comment"># Advanced parameters example</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a creative storyteller."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Write a creative story"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7oc2vp">vLLM లో స్థానిక పైథాన్ interface కూడా ఉంది:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> vllm <span class="hljs-keyword">import</span> LLM, SamplingParams | |
| <span class="hljs-comment"># Initialize the model with advanced parameters</span> | |
| llm = LLM( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| gpu_memory_utilization=<span class="hljs-number">0.85</span>, | |
| max_num_batched_tokens=<span class="hljs-number">8192</span>, | |
| max_num_seqs=<span class="hljs-number">256</span>, | |
| block_size=<span class="hljs-number">16</span>, | |
| ) | |
| <span class="hljs-comment"># Configure sampling parameters</span> | |
| sampling_params = SamplingParams( | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span> | |
| top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Consider top 95% probability mass</span> | |
| max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Maximum length</span> | |
| presence_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Reduce repetition</span> | |
| frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Reduce repetition</span> | |
| stop=[<span class="hljs-string">"\n\n"</span>, <span class="hljs-string">"###"</span>], <span class="hljs-comment"># Stop sequences</span> | |
| ) | |
| <span class="hljs-comment"># Generate text</span> | |
| prompt = <span class="hljs-string">"Write a creative story"</span> | |
| outputs = llm.generate(prompt, sampling_params) | |
| <span class="hljs-built_in">print</span>(outputs[<span class="hljs-number">0</span>].outputs[<span class="hljs-number">0</span>].text) | |
| <span class="hljs-comment"># For chat-style interactions</span> | |
| chat_prompt = [ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a creative storyteller."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Write a creative story"</span>}, | |
| ] | |
| outputs = llm.chat(chat_prompt, sampling_params) <span class="hljs-comment"># Applies the model's chat template</span> | |
| <span class="hljs-built_in">print</span>(outputs[<span class="hljs-number">0</span>].outputs[<span class="hljs-number">0</span>].text)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h2 class="relative group"><a id="అధనతన-జనరషన-నయతరణ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#అధనతన-జనరషన-నయతరణ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>అధునాతన జనరేషన్ నియంత్రణ</span></h2> <h3 class="relative group"><a id="token-ఎపక-మరయ-sampling" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#token-ఎపక-మరయ-sampling"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 
8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Token ఎంపిక మరియు Sampling</span></h3> <p data-svelte-h="svelte-7ig94u">టెక్స్ట్ను జనరేట్ చేసే ప్రక్రియలో, ప్రతి దశలో వచ్చే తదుపరి token ను ఎంపిక చేయాలి. ఈ ఎంపికను పలు నియంత్రణ పరామితుల ద్వారా ప్రభావితం చేయవచ్చు:</p> <ol data-svelte-h="svelte-1o46gpg"><li><strong>Raw Logits</strong>: ప్రతి token కోసం మోడల్ ఇచ్చే ప్రారంభ, normalize చేయని స్కోర్లు (softmax తర్వాత ఇవి probabilities అవుతాయి)</li> <li><strong>Temperature</strong>: యాదృచ్ఛికతను నియంత్రిస్తుంది (విలువ ఎక్కువైతే output మరింత creative గా ఉంటుంది)</li> <li><strong>Top-p (Nucleus) Sampling</strong>: మొత్తం probability లో X% వచ్చే వరకు ఉన్న అత్యుత్తమ tokens ను మాత్రమే పరిగణలోకి తీసుకోవడం</li> <li><strong>Top-k Filtering</strong>: అత్యధిక సంభావ్యత గల k tokens కు selection ను పరిమితం చేయడం</li></ol> <p data-svelte-h="svelte-1246drw">ఈ పరామితులను ఎలా సెట్ చేయాలో ఇక్కడ చూపుతున్నాం:</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->client.generate( | |
| <span class="hljs-string">"Write a creative story"</span>, | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span> | |
| top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Consider top 95% probability mass</span> | |
| top_k=<span class="hljs-number">50</span>, <span class="hljs-comment"># Consider top 50 tokens</span> | |
| max_new_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Maximum length</span> | |
| repetition_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Reduce repetition</span> | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Via OpenAI API compatibility</span> | |
| response = client.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, <span class="hljs-comment"># Model name (can be any string for llama.cpp server)</span> | |
| prompt=<span class="hljs-string">"Write a creative story"</span>, | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span> | |
| top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Consider top 95% probability mass</span> | |
| frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Reduce repetition</span> | |
| presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Reduce repetition</span> | |
| max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Maximum length</span> | |
| ) | |
| <span class="hljs-comment"># Via llama-cpp-python direct access</span> | |
| output = llm( | |
| <span class="hljs-string">"Write a creative story"</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| top_k=<span class="hljs-number">50</span>, | |
| max_tokens=<span class="hljs-number">100</span>, | |
| repeat_penalty=<span class="hljs-number">1.1</span>, | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->params = SamplingParams( | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span> | |
| top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Consider top 95% probability mass</span> | |
| top_k=<span class="hljs-number">50</span>, <span class="hljs-comment"># Consider top 50 tokens</span> | |
| max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Maximum length</span> | |
| presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Reduce repetition</span> | |
| ) | |
| llm.generate(<span class="hljs-string">"Write a creative story"</span>, sampling_params=params)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h3 class="relative group"><a id="పనరవత-నవరణ-controlling-repetition" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#పనరవత-నవరణ-controlling-repetition"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>పునరావృతం నివారణ (Controlling Repetition)</span></h3> <p data-svelte-h="svelte-12vnxln">పునరావృతమైన లేదా ఒకే విధమైన టెక్స్ట్ను మోడల్ నిరంతరం ఉత్పత్తి చేయకుండా నిలువరించడానికి frameworks నియంత్రణ పద్ధతులు అందిస్తాయి:</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->client.generate( | |
| <span class="hljs-string">"Write a varied text"</span>, | |
| repetition_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Penalize repeated tokens</span> | |
| no_repeat_ngram_size=<span class="hljs-number">3</span>, <span class="hljs-comment"># Prevent 3-gram repetition</span> | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Via OpenAI API</span> | |
| response = client.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, | |
| prompt=<span class="hljs-string">"Write a varied text"</span>, | |
| frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Penalize frequent tokens</span> | |
| presence_penalty=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Penalize tokens already present</span> | |
| ) | |
| <span class="hljs-comment"># Via direct library</span> | |
| output = llm( | |
| <span class="hljs-string">"Write a varied text"</span>, | |
| repeat_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Penalize repeated tokens</span> | |
| frequency_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># Additional frequency penalty</span> | |
| presence_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># Additional presence penalty</span> | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->params = SamplingParams( | |
| presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Penalize token presence</span> | |
| frequency_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Penalize token frequency</span> | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h3 class="relative group"><a id="పడవ-నయతరణ-మరయ-stop-sequences" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#పడవ-నయతరణ-మరయ-stop-sequences"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>పొడవు నియంత్రణ మరియు Stop Sequences</span></h3> <p data-svelte-h="svelte-1x4l60u">జనరేట్ చేసే టెక్స్ట్ ఎంత పొడవు ఉండాలి, ఎప్పుడు generation ఆగాలి అనేదాన్ని కూడా నియంత్రించవచ్చు:</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->client.generate( | |
| <span class="hljs-string">"Generate a short paragraph"</span>, | |
| max_new_tokens=<span class="hljs-number">100</span>, | |
| min_new_tokens=<span class="hljs-number">10</span>, | |
| stop_sequences=[<span class="hljs-string">"\n\n"</span>, <span class="hljs-string">"###"</span>], | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Via OpenAI API</span> | |
| response = client.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, | |
| prompt=<span class="hljs-string">"Generate a short paragraph"</span>, | |
| max_tokens=<span class="hljs-number">100</span>, | |
| stop=[<span class="hljs-string">"\n\n"</span>, <span class="hljs-string">"###"</span>], | |
| ) | |
| <span class="hljs-comment"># Via direct library</span> | |
| output = llm(<span class="hljs-string">"Generate a short paragraph"</span>, max_tokens=<span class="hljs-number">100</span>, stop=[<span class="hljs-string">"\n\n"</span>, <span class="hljs-string">"###"</span>])<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->params = SamplingParams( | |
| max_tokens=<span class="hljs-number">100</span>, | |
| min_tokens=<span class="hljs-number">10</span>, | |
| stop=[<span class="hljs-string">"###"</span>, <span class="hljs-string">"\n\n"</span>], | |
| ignore_eos=<span class="hljs-literal">False</span>, | |
| skip_special_tokens=<span class="hljs-literal">True</span>, | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h2 class="relative group"><a id="మమర-నరవహణ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#మమర-నరవహణ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>మెమరీ నిర్వహణ</span></h2> <p data-svelte-h="svelte-16gmr3r">సమర్థవంతమైన inference కోసం ఈ frameworks అన్నీ అభివృద్ధి చెందిన మెమరీ మేనేజ్మెంట్ విధానాలను ఉపయోగిస్తాయి.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <p data-svelte-h="svelte-1i9s29h">TGI, Flash Attention 2 మరియు continuous batching ను ఉపయోగిస్తుంది:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Docker deployment with memory optimization</span> | |
| docker run --gpus all -p 8080:80 \ | |
| --shm-size 1g \ | |
| ghcr.io/huggingface/text-generation-inference:latest \ | |
| --model-id HuggingFaceTB/SmolLM2-1.7B-Instruct \ | |
| --max-batch-total-tokens 8192 \ | |
| --max-input-length 4096<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <p data-svelte-h="svelte-f1gk1a">llama.cpp లో quantization మరియు optimized memory layout వాడుతుంది:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Server with memory optimizations</span> | |
| <span class="hljs-comment"># Flags: -c = context size, --threads = CPU threads, --n-gpu-layers = use more GPU layers for larger models,</span> | |
| <span class="hljs-comment"># --mlock = lock memory to prevent swapping</span> | |
| ./server \ | |
| -m smollm2-1.7b-instruct.Q4_K_M.gguf \ | |
| --host 0.0.0.0 \ | |
| --port 8080 \ | |
| -c 2048 \ | |
| --threads 4 \ | |
| --n-gpu-layers 32 \ | |
| --mlock \ | |
| --cont-batching <span class="hljs-comment"># Enable continuous batching</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1cixy12">మీ GPU కి చాలా పెద్దగా ఉన్న మోడళ్ల కోసం CPU offloading ను ఉపయోగించవచ్చు:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->./server \ | |
| -m smollm2-1.7b-instruct.Q4_K_M.gguf \ | |
| --n-gpu-layers 20 \ | |
| --threads 8 <span class="hljs-comment"># Keep first 20 layers on GPU; use more CPU threads for CPU layers</span><!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <p data-svelte-h="svelte-gk2ujy">vLLM, మెమరీని అత్యంత సమర్థవంతంగా నిర్వహించేందుకు PagedAttention ను ఉపయోగిస్తుంది:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> vllm.engine.arg_utils <span class="hljs-keyword">import</span> AsyncEngineArgs | |
| <span class="hljs-keyword">from</span> vllm <span class="hljs-keyword">import</span> AsyncLLMEngine | |
| engine_args = AsyncEngineArgs( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-1.7B-Instruct"</span>, | |
| gpu_memory_utilization=<span class="hljs-number">0.85</span>, | |
| max_num_batched_tokens=<span class="hljs-number">8192</span>, | |
| block_size=<span class="hljs-number">16</span>, | |
| ) | |
| llm = AsyncLLMEngine.from_engine_args(engine_args)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h2 class="relative group"><a id="వనరల-resources" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#వనరల-resources"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>వనరులు (Resources)</span></h2> <ul data-svelte-h="svelte-15h1dzu"><li><a href="https://huggingface.co/docs/text-generation-inference" rel="nofollow">Text Generation Inference Documentation</a></li> <li><a href="https://github.com/huggingface/text-generation-inference" rel="nofollow">TGI GitHub Repository</a></li> <li><a href="https://vllm.readthedocs.io/" rel="nofollow">vLLM Documentation</a></li> <li><a href="https://github.com/vllm-project/vllm" rel="nofollow">vLLM GitHub Repository</a></li> <li><a href="https://arxiv.org/abs/2309.06180" rel="nofollow">PagedAttention Paper</a></li> <li><a href="https://github.com/ggerganov/llama.cpp" rel="nofollow">llama.cpp GitHub Repository</a></li> <li><a href="https://github.com/abetlen/llama-cpp-python" rel="nofollow">llama-cpp-python Repository</a></li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" 
href="https://github.com/huggingface/course/blob/main/chapters/te/chapter2/8.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_1e95xo9 = { | |
| assets: "/docs/course/pr_1149/te", | |
| base: "/docs/course/pr_1149/te", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/course/pr_1149/te/_app/immutable/entry/start.eee4c1e0.js"), | |
| import("/docs/course/pr_1149/te/_app/immutable/entry/app.0f38558a.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 21], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 119 kB
- Xet hash:
- 6e9df99e5f4b6818760e9acdd3bef412528f7c4b86086b0bc0e02ef3d0cb1185
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.